From 67adc3ad67ff12fb08ba1ec8124bbdfb03a7b68a Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Tue, 15 Apr 2025 16:55:37 +0100
Subject: [PATCH 1/7] Fix bio2zarr import

---
 tests/test_stats.py      | 4 ++--
 tests/test_vcf_writer.py | 4 ++--
 tests/utils.py           | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/test_stats.py b/tests/test_stats.py
index 7a28c3f..9f1b9c3 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -3,7 +3,7 @@
 
 import pytest
 import zarr
-from bio2zarr import vcf2zarr
+from bio2zarr import vcf
 
 from vcztools.stats import nrecords, stats
 
@@ -39,7 +39,7 @@ def test_stats__no_index(tmp_path):
     original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
     # don't use cache here since we want to make sure vcz is not indexed
     vcz = tmp_path.joinpath("intermediate.vcz")
-    vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False)
+    vcf.convert([original], vcz, worker_processes=0, local_alleles=False)
 
     # delete the index created by vcf2zarr
     root = zarr.open(vcz, mode="a")
diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py
index 6ec9619..a00f5e9 100644
--- a/tests/test_vcf_writer.py
+++ b/tests/test_vcf_writer.py
@@ -6,7 +6,7 @@
 import numpy as np
 import pytest
 import zarr
-from bio2zarr import vcf2zarr
+from bio2zarr import vcf
 from cyvcf2 import VCF
 from numpy.testing import assert_array_equal
 
@@ -305,7 +305,7 @@ def test_write_vcf__generate_header(tmp_path):
     original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
     # don't use cache here since we mutate the vcz
     vcz = tmp_path.joinpath("intermediate.vcz")
-    vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False)
+    vcf.convert([original], vcz, worker_processes=0, local_alleles=False)
 
     # remove vcf_header
     root = zarr.open(vcz, mode="r+")
diff --git a/tests/utils.py b/tests/utils.py
index 6eb5c29..f7fda49 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -5,7 +5,7 @@
 
 import cyvcf2
 import numpy as np
-from bio2zarr import vcf2zarr
+from bio2zarr import vcf
 
 
 @contextmanager
@@ -145,7 +145,7 @@ def vcz_path_cache(vcf_path):
     cached_vcz_path = (cache_path / vcf_path.name).with_suffix(".vcz")
     if not cached_vcz_path.exists():
         if vcf_path.name.startswith("chr22"):
-            vcf2zarr.convert(
+            vcf.convert(
                 [vcf_path],
                 cached_vcz_path,
                 worker_processes=0,
@@ -153,7 +153,7 @@ def vcz_path_cache(vcf_path):
                 samples_chunk_size=10,
             )
         else:
-            vcf2zarr.convert(
+            vcf.convert(
                 [vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False
             )
     return cached_vcz_path

From 3079a413f52c940102d52d9205972418d750f6ba Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Wed, 16 Apr 2025 10:14:40 +0100
Subject: [PATCH 2/7] Don't use original VCF header in vcz

---
 vcztools/vcf_writer.py | 49 ++++++------------------------------------
 1 file changed, 6 insertions(+), 43 deletions(-)

diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py
index c402880..ed84f3e 100644
--- a/vcztools/vcf_writer.py
+++ b/vcztools/vcf_writer.py
@@ -1,6 +1,5 @@
 import io
 import logging
-import re
 import sys
 from datetime import datetime
 
@@ -144,11 +143,9 @@ def write_vcf(
         )
 
         if not no_header:
-            original_header = root.attrs.get("vcf_header", None)
             force_ac_an_header = not drop_genotypes and samples_selection is not None
             vcf_header = _generate_header(
                 root,
-                original_header,
                 sample_ids,
                 no_version=no_version,
                 force_ac_an=force_ac_an_header,
@@ -299,7 +296,6 @@ def c_chunk_to_vcf(
 
 def _generate_header(
     ds,
-    original_header,
     sample_ids,
     *,
     no_version: bool = False,
@@ -339,45 +335,12 @@ def _generate_header(
             if key in ("genotype", "genotype_phased"):
                 continue
             format_fields.append(key)
-    if original_header is None:  # generate entire header
-        # [1.4.1 File format]
-        print("##fileformat=VCFv4.3", file=output)
-
-        if "source" in ds.attrs:
-            print(f'##source={ds.attrs["source"]}', file=output)
-
-    else:  # use original header fields where appropriate
-        unstructured_pattern = re.compile("##([^=]+)=([^<].*)")
-        structured_pattern = re.compile("##([^=]+)=(<.*)")
-
-        for line in original_header.split("\n"):
-            if re.fullmatch(unstructured_pattern, line):
-                print(line, file=output)
-            else:
-                match = re.fullmatch(structured_pattern, line)
-                if match:
-                    category = match.group(1)
-                    id_pattern = re.compile("ID=([^,>]+)")
-                    key = id_pattern.findall(line)[0]
-                    if category not in ("contig", "FILTER", "INFO", "FORMAT"):
-                        # output other structured fields
-                        print(line, file=output)
-                    # only output certain categories if in dataset
-                    elif category == "contig" and key in contigs:
-                        contigs.remove(key)
-                        print(line, file=output)
-                    elif category == "FILTER" and key in filters:
-                        filters.remove(key)
-                        print(line, file=output)
-                    elif category == "INFO" and key in info_fields:
-                        info_fields.remove(key)
-                        print(line, file=output)
-                    elif category == "FORMAT" and key in format_fields:
-                        format_fields.remove(key)
-                        print(line, file=output)
-
-    # add all fields that are not in the original header
-    # or all fields if there was no original header
+
+    # [1.4.1 File format]
+    print("##fileformat=VCFv4.3", file=output)
+
+    if "source" in ds.attrs:
+        print(f'##source={ds.attrs["source"]}', file=output)
 
     # [1.4.2 Information field format]
     for key in info_fields:

From 808a97eff05705156994bfe58050995b3cff2d90 Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Tue, 15 Apr 2025 11:46:46 +0100
Subject: [PATCH 3/7] Fix contig lengths

---
 vcztools/vcf_writer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py
index ed84f3e..f7eca31 100644
--- a/vcztools/vcf_writer.py
+++ b/vcztools/vcf_writer.py
@@ -392,9 +392,7 @@ def _generate_header(
             )
 
     # [1.4.7 Contig field format]
-    contig_lengths = (
-        ds.attrs["contig_lengths"] if "contig_lengths" in ds.attrs else None
-    )
+    contig_lengths = ds["contig_length"] if "contig_length" in ds else None
     for i, contig in enumerate(contigs):
         if contig_lengths is None:
             print(f"##contig=<ID={contig}>", file=output)

From 266b546653b8b213eb417f2dc4ea6a0c26f6efa5 Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Wed, 16 Apr 2025 09:57:41 +0100
Subject: [PATCH 4/7] Filter descriptions

---
 vcztools/vcf_writer.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py
index f7eca31..16b34d8 100644
--- a/vcztools/vcf_writer.py
+++ b/vcztools/vcf_writer.py
@@ -368,8 +368,17 @@ def _generate_header(
                 )
 
     # [1.4.3 Filter field format]
-    for filter in filters:
-        print(f'##FILTER=<ID={filter},Description="">', file=output)
+    filter_descriptions = (
+        ds["filter_description"] if "filter_description" in ds else None
+    )
+    for i, filter in enumerate(filters):
+        filter_description = (
+            "" if filter_descriptions is None else filter_descriptions[i]
+        )
+        print(
+            f'##FILTER=<ID={filter},Description="{filter_description}">',
+            file=output,
+        )
 
     # [1.4.4 Individual format field format]
     for key in format_fields:

From cbda92ab297000791aaaec4d898792a5e1404edc Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Wed, 16 Apr 2025 10:14:45 +0100
Subject: [PATCH 5/7] Loosen test for checking if VCF headers are the same

---
 tests/test_vcf_writer.py | 18 ++++++------------
 tests/utils.py           | 41 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py
index a00f5e9..625eb66 100644
--- a/tests/test_vcf_writer.py
+++ b/tests/test_vcf_writer.py
@@ -6,7 +6,6 @@
 import numpy as np
 import pytest
 import zarr
-from bio2zarr import vcf
 from cyvcf2 import VCF
 from numpy.testing import assert_array_equal
 
@@ -301,15 +300,9 @@ def test_write_vcf__header_flags(tmp_path):
     assert_vcfs_close(original, output)
 
 
-def test_write_vcf__generate_header(tmp_path):
+def test_write_vcf__generate_header():
     original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
-    # don't use cache here since we mutate the vcz
-    vcz = tmp_path.joinpath("intermediate.vcz")
-    vcf.convert([original], vcz, worker_processes=0, local_alleles=False)
-
-    # remove vcf_header
-    root = zarr.open(vcz, mode="r+")
-    del root.attrs["vcf_header"]
+    vcz = vcz_path_cache(original)
 
     output_header = StringIO()
     write_vcf(vcz, output_header, header_only=True, no_version=True)
@@ -324,9 +317,9 @@ def test_write_vcf__generate_header(tmp_path):
 ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
 ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
-##FILTER=<ID=PASS,Description="">
-##FILTER=<ID=s50,Description="">
-##FILTER=<ID=q10,Description="">
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FILTER=<ID=q10,Description="Quality below 10">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
@@ -338,6 +331,7 @@ def test_write_vcf__generate_header(tmp_path):
 """  # noqa: E501
 
     # substitute value of source
+    root = zarr.open(vcz, mode="r+")
     expected_vcf_header = expected_vcf_header.format(root.attrs["source"])
 
     assert output_header.getvalue() == expected_vcf_header
diff --git a/tests/utils.py b/tests/utils.py
index f7fda49..156edd8 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -29,6 +29,45 @@ def normalise_info_missingness(info_dict, key):
     return value
 
 
+def _get_headers(vcf, header_type):
+    def to_dict(header_field):
+        d = header_field.info(extra=True)
+        del d[b"IDX"]  # remove IDX since we don't care about ordering
+
+        # cyvcf2 duplicates some keys as strings and bytes, so remove the bytes one
+        for k in list(d.keys()):
+            if isinstance(k, bytes) and k.decode("utf-8") in d:
+                del d[k]
+        return d
+
+    return {
+        field["ID"]: to_dict(field)
+        for field in vcf.header_iter()
+        if field["HeaderType"] == header_type
+    }
+
+
+def _assert_vcf_headers_equivalent(vcf1, vcf2):
+    # Only compare INFO, FORMAT, FILTER, CONTIG fields, ignoring order
+    # Other fields are ignored
+
+    info1 = _get_headers(vcf1, "INFO")
+    info2 = _get_headers(vcf2, "INFO")
+    assert info1 == info2
+
+    format1 = _get_headers(vcf1, "FORMAT")
+    format2 = _get_headers(vcf2, "FORMAT")
+    assert format1 == format2
+
+    filter1 = _get_headers(vcf1, "FILTER")
+    filter2 = _get_headers(vcf2, "FILTER")
+    assert filter1 == filter2
+
+    contig1 = _get_headers(vcf1, "CONTIG")
+    contig2 = _get_headers(vcf2, "CONTIG")
+    assert contig1 == contig2
+
+
 def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=False):
     """Like :py:func:`numpy.testing.assert_allclose()`, but for VCF files.
 
@@ -48,7 +87,7 @@ def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=Fal
         Absolute tolerance.
     """
     with open_vcf(f1) as vcf1, open_vcf(f2) as vcf2:
-        assert vcf1.raw_header == vcf2.raw_header
+        _assert_vcf_headers_equivalent(vcf1, vcf2)
         assert vcf1.samples == vcf2.samples
 
         count = 0

From bdd9d2526e03a8e095a0191f50815d5a03446999 Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Tue, 22 Apr 2025 15:14:01 +0100
Subject: [PATCH 6/7] Match Number="." properly

---
 tests/utils.py | 45 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/tests/utils.py b/tests/utils.py
index 156edd8..54a1409 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -29,7 +29,7 @@ def normalise_info_missingness(info_dict, key):
     return value
 
 
-def _get_headers(vcf, header_type):
+def _get_header_field_dicts(vcf, header_type):
     def to_dict(header_field):
         d = header_field.info(extra=True)
         del d[b"IDX"]  # remove IDX since we don't care about ordering
@@ -47,25 +47,44 @@ def to_dict(header_field):
     }
 
 
+def _assert_header_field_dicts_equivalent(field_dicts1, field_dicts2):
+    assert len(field_dicts1) == len(field_dicts2)
+
+    for id in field_dicts1.keys():
+        assert id in field_dicts2
+        field_dict1 = field_dicts1[id]
+        field_dict2 = field_dicts2[id]
+
+        assert len(field_dict1) == len(field_dict2)
+        # all fields should be the same, except Number="." which can match any value
+        for k in field_dict1.keys():
+            assert k in field_dict2
+            v1 = field_dict1[k]
+            v2 = field_dict2[k]
+            if k == "Number" and (v1 == "." or v2 == "."):
+                continue
+            assert v1 == v2, f"Failed in field {id} with key {k}"
+
+
 def _assert_vcf_headers_equivalent(vcf1, vcf2):
     # Only compare INFO, FORMAT, FILTER, CONTIG fields, ignoring order
     # Other fields are ignored
 
-    info1 = _get_headers(vcf1, "INFO")
-    info2 = _get_headers(vcf2, "INFO")
-    assert info1 == info2
+    info1 = _get_header_field_dicts(vcf1, "INFO")
+    info2 = _get_header_field_dicts(vcf2, "INFO")
+    _assert_header_field_dicts_equivalent(info1, info2)
 
-    format1 = _get_headers(vcf1, "FORMAT")
-    format2 = _get_headers(vcf2, "FORMAT")
-    assert format1 == format2
+    format1 = _get_header_field_dicts(vcf1, "FORMAT")
+    format2 = _get_header_field_dicts(vcf2, "FORMAT")
+    _assert_header_field_dicts_equivalent(format1, format2)
 
-    filter1 = _get_headers(vcf1, "FILTER")
-    filter2 = _get_headers(vcf2, "FILTER")
-    assert filter1 == filter2
+    filter1 = _get_header_field_dicts(vcf1, "FILTER")
+    filter2 = _get_header_field_dicts(vcf2, "FILTER")
+    _assert_header_field_dicts_equivalent(filter1, filter2)
 
-    contig1 = _get_headers(vcf1, "CONTIG")
-    contig2 = _get_headers(vcf2, "CONTIG")
-    assert contig1 == contig2
+    contig1 = _get_header_field_dicts(vcf1, "CONTIG")
+    contig2 = _get_header_field_dicts(vcf2, "CONTIG")
+    _assert_header_field_dicts_equivalent(contig1, contig2)
 
 
 def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=False):

From 25fcb2543a474429399dcf977a11d851c97f776e Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Tue, 13 May 2025 11:53:12 +0100
Subject: [PATCH 7/7] Use any non-redundant meta information lines stored by
 bio2zarr

---
 tests/test_vcf_writer.py | 5 +++++
 vcztools/vcf_writer.py   | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py
index 625eb66..1f32705 100644
--- a/tests/test_vcf_writer.py
+++ b/tests/test_vcf_writer.py
@@ -327,6 +327,11 @@ def test_write_vcf__generate_header():
 ##contig=<ID=19>
 ##contig=<ID=20>
 ##contig=<ID=X>
+##fileDate=20090805
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
+##ALT=<ID=CNV,Description="Copy number variable region">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
 """  # noqa: E501
 
diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py
index 16b34d8..059ba09 100644
--- a/vcztools/vcf_writer.py
+++ b/vcztools/vcf_writer.py
@@ -414,6 +414,12 @@ def _generate_header(
             file=output,
         )
 
+    # Other meta information lines not covered above
+    if "vcf_meta_information" in ds.attrs:
+        for key, value in ds.attrs["vcf_meta_information"]:
+            if key not in ("fileformat", "source"):
+                print(f"##{key}={value}", file=output)
+
     # [1.5 Header line syntax]
     print(
         "#CHROM",