From 67adc3ad67ff12fb08ba1ec8124bbdfb03a7b68a Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 15 Apr 2025 16:55:37 +0100 Subject: [PATCH 1/7] Fix bio2zarr import --- tests/test_stats.py | 4 ++-- tests/test_vcf_writer.py | 4 ++-- tests/utils.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_stats.py b/tests/test_stats.py index 7a28c3f..9f1b9c3 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -3,7 +3,7 @@ import pytest import zarr -from bio2zarr import vcf2zarr +from bio2zarr import vcf from vcztools.stats import nrecords, stats @@ -39,7 +39,7 @@ def test_stats__no_index(tmp_path): original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" # don't use cache here since we want to make sure vcz is not indexed vcz = tmp_path.joinpath("intermediate.vcz") - vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False) + vcf.convert([original], vcz, worker_processes=0, local_alleles=False) # delete the index created by vcf2zarr root = zarr.open(vcz, mode="a") diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py index 6ec9619..a00f5e9 100644 --- a/tests/test_vcf_writer.py +++ b/tests/test_vcf_writer.py @@ -6,7 +6,7 @@ import numpy as np import pytest import zarr -from bio2zarr import vcf2zarr +from bio2zarr import vcf from cyvcf2 import VCF from numpy.testing import assert_array_equal @@ -305,7 +305,7 @@ def test_write_vcf__generate_header(tmp_path): original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" # don't use cache here since we mutate the vcz vcz = tmp_path.joinpath("intermediate.vcz") - vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False) + vcf.convert([original], vcz, worker_processes=0, local_alleles=False) # remove vcf_header root = zarr.open(vcz, mode="r+") diff --git a/tests/utils.py b/tests/utils.py index 6eb5c29..f7fda49 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,7 +5,7 @@ import cyvcf2 import numpy as np -from bio2zarr import vcf2zarr +from bio2zarr import vcf @contextmanager @@ -145,7 +145,7 @@ def vcz_path_cache(vcf_path): cached_vcz_path = (cache_path / vcf_path.name).with_suffix(".vcz") if not cached_vcz_path.exists(): if vcf_path.name.startswith("chr22"): - vcf2zarr.convert( + vcf.convert( [vcf_path], cached_vcz_path, worker_processes=0, @@ -153,7 +153,7 @@ def vcz_path_cache(vcf_path): samples_chunk_size=10, ) else: - vcf2zarr.convert( + vcf.convert( [vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False ) return cached_vcz_path From 3079a413f52c940102d52d9205972418d750f6ba Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 16 Apr 2025 10:14:40 +0100 Subject: [PATCH 2/7] Don't use original VCF header in vcz --- vcztools/vcf_writer.py | 49 ++++++------------------------------------ 1 file changed, 6 insertions(+), 43 deletions(-) diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index c402880..ed84f3e 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -1,6 +1,5 @@ import io import logging -import re import sys from datetime import datetime @@ -144,11 +143,9 @@ def write_vcf( ) if not no_header: - original_header = root.attrs.get("vcf_header", None) force_ac_an_header = not drop_genotypes and samples_selection is not None vcf_header = _generate_header( root, - original_header, sample_ids, no_version=no_version, force_ac_an=force_ac_an_header, @@ -299,7 +296,6 @@ def c_chunk_to_vcf( def _generate_header( ds, - original_header, sample_ids, *, no_version: bool = False, @@ -339,45 +335,12 @@ def _generate_header( if key in ("genotype", "genotype_phased"): continue format_fields.append(key) - if original_header is None: # generate entire header - # [1.4.1 File format] - print("##fileformat=VCFv4.3", file=output) - - if "source" in ds.attrs: - print(f'##source={ds.attrs["source"]}', file=output) - - else: # use original header fields where appropriate - unstructured_pattern = re.compile("##([^=]+)=([^<].*)") - structured_pattern = re.compile("##([^=]+)=(<.*)") - - for line in original_header.split("\n"): - if re.fullmatch(unstructured_pattern, line): - print(line, file=output) - else: - match = re.fullmatch(structured_pattern, line) - if match: - category = match.group(1) - id_pattern = re.compile("ID=([^,>]+)") - key = id_pattern.findall(line)[0] - if category not in ("contig", "FILTER", "INFO", "FORMAT"): - # output other structured fields - print(line, file=output) - # only output certain categories if in dataset - elif category == "contig" and key in contigs: - contigs.remove(key) - print(line, file=output) - elif category == "FILTER" and key in filters: - filters.remove(key) - print(line, file=output) - elif category == "INFO" and key in info_fields: - info_fields.remove(key) - print(line, file=output) - elif category == "FORMAT" and key in format_fields: - format_fields.remove(key) - print(line, file=output) - - # add all fields that are not in the original header - # or all fields if there was no original header + + # [1.4.1 File format] + print("##fileformat=VCFv4.3", file=output) + + if "source" in ds.attrs: + print(f'##source={ds.attrs["source"]}', file=output) # [1.4.2 Information field format] for key in info_fields: From 808a97eff05705156994bfe58050995b3cff2d90 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 15 Apr 2025 11:46:46 +0100 Subject: [PATCH 3/7] Fix contig lengths --- vcztools/vcf_writer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index ed84f3e..f7eca31 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -392,9 +392,7 @@ def _generate_header( ) # [1.4.7 Contig field format] - contig_lengths = ( - ds.attrs["contig_lengths"] if "contig_lengths" in ds.attrs else None - ) + contig_lengths = ds["contig_length"] if "contig_length" in ds else None for i, contig in enumerate(contigs): if contig_lengths is None: print(f"##contig=", file=output) From 266b546653b8b213eb417f2dc4ea6a0c26f6efa5 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 16 Apr 2025 09:57:41 +0100 Subject: [PATCH 4/7] Filter descriptions --- vcztools/vcf_writer.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index f7eca31..16b34d8 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -368,8 +368,17 @@ def _generate_header( ) # [1.4.3 Filter field format] - for filter in filters: - print(f'##FILTER=', file=output) + filter_descriptions = ( + ds["filter_description"] if "filter_description" in ds else None + ) + for i, filter in enumerate(filters): + filter_description = ( + "" if filter_descriptions is None else filter_descriptions[i] + ) + print( + f'##FILTER=', + file=output, + ) # [1.4.4 Individual format field format] for key in format_fields: From cbda92ab297000791aaaec4d898792a5e1404edc Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 16 Apr 2025 10:14:45 +0100 Subject: [PATCH 5/7] Loosen test for checking if VCF headers are the same --- tests/test_vcf_writer.py | 18 ++++++------------ tests/utils.py | 41 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py index a00f5e9..625eb66 100644 --- a/tests/test_vcf_writer.py +++ b/tests/test_vcf_writer.py @@ -6,7 +6,6 @@ import numpy as np import pytest import zarr -from bio2zarr import vcf from cyvcf2 import VCF from numpy.testing import assert_array_equal @@ -301,15 +300,9 @@ def test_write_vcf__header_flags(tmp_path): assert_vcfs_close(original, output) -def test_write_vcf__generate_header(tmp_path): +def test_write_vcf__generate_header(): original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" - # don't use cache here since we mutate the vcz - vcz = tmp_path.joinpath("intermediate.vcz") - vcf.convert([original], vcz, worker_processes=0, local_alleles=False) - - # remove vcf_header - root = zarr.open(vcz, mode="r+") - del root.attrs["vcf_header"] + vcz = vcz_path_cache(original) output_header = StringIO() write_vcf(vcz, output_header, header_only=True, no_version=True) @@ -324,9 +317,9 @@ def test_write_vcf__generate_header(tmp_path): ##INFO= ##INFO= ##INFO= -##FILTER= -##FILTER= -##FILTER= +##FILTER= +##FILTER= +##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= @@ -338,6 +331,7 @@ def test_write_vcf__generate_header(tmp_path): """ # noqa: E501 # substitute value of source + root = zarr.open(vcz, mode="r+") expected_vcf_header = expected_vcf_header.format(root.attrs["source"]) assert output_header.getvalue() == expected_vcf_header diff --git a/tests/utils.py b/tests/utils.py index f7fda49..156edd8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -29,6 +29,45 @@ def normalise_info_missingness(info_dict, key): return value +def _get_headers(vcf, header_type): + def to_dict(header_field): + d = header_field.info(extra=True) + del d[b"IDX"] # remove IDX since we don't care about ordering + + # cyvcf2 duplicates some keys as strings and bytes, so remove the bytes one + for k in list(d.keys()): + if isinstance(k, bytes) and k.decode("utf-8") in d: + del d[k] + return d + + return { + field["ID"]: to_dict(field) + for field in vcf.header_iter() + if field["HeaderType"] == header_type + } + + +def _assert_vcf_headers_equivalent(vcf1, vcf2): + # Only compare INFO, FORMAT, FILTER, CONTIG fields, ignoring order + # Other fields are ignored + + info1 = _get_headers(vcf1, "INFO") + info2 = _get_headers(vcf2, "INFO") + assert info1 == info2 + + format1 = _get_headers(vcf1, "FORMAT") + format2 = _get_headers(vcf2, "FORMAT") + assert format1 == format2 + + filter1 = _get_headers(vcf1, "FILTER") + filter2 = _get_headers(vcf2, "FILTER") + assert filter1 == filter2 + + contig1 = _get_headers(vcf1, "CONTIG") + contig2 = _get_headers(vcf2, "CONTIG") + assert contig1 == contig2 + + def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=False): """Like :py:func:`numpy.testing.assert_allclose()`, but for VCF files. @@ -48,7 +87,7 @@ def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=Fal Absolute tolerance. """ with open_vcf(f1) as vcf1, open_vcf(f2) as vcf2: - assert vcf1.raw_header == vcf2.raw_header + _assert_vcf_headers_equivalent(vcf1, vcf2) assert vcf1.samples == vcf2.samples count = 0 From bdd9d2526e03a8e095a0191f50815d5a03446999 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 22 Apr 2025 15:14:01 +0100 Subject: [PATCH 6/7] Match Number="." properly --- tests/utils.py | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 156edd8..54a1409 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -29,7 +29,7 @@ def normalise_info_missingness(info_dict, key): return value -def _get_headers(vcf, header_type): +def _get_header_field_dicts(vcf, header_type): def to_dict(header_field): d = header_field.info(extra=True) del d[b"IDX"] # remove IDX since we don't care about ordering @@ -47,25 +47,44 @@ def to_dict(header_field): } +def _assert_header_field_dicts_equivalent(field_dicts1, field_dicts2): + assert len(field_dicts1) == len(field_dicts2) + + for id in field_dicts1.keys(): + assert id in field_dicts2 + field_dict1 = field_dicts1[id] + field_dict2 = field_dicts2[id] + + assert len(field_dict1) == len(field_dict2) + # all fields should be the same, except Number="." which can match any value + for k in field_dict1.keys(): + assert k in field_dict2 + v1 = field_dict1[k] + v2 = field_dict2[k] + if k == "Number" and (v1 == "." or v2 == "."): + continue + assert v1 == v2, f"Failed in field {id} with key {k}" + + def _assert_vcf_headers_equivalent(vcf1, vcf2): # Only compare INFO, FORMAT, FILTER, CONTIG fields, ignoring order # Other fields are ignored - info1 = _get_headers(vcf1, "INFO") - info2 = _get_headers(vcf2, "INFO") - assert info1 == info2 + info1 = _get_header_field_dicts(vcf1, "INFO") + info2 = _get_header_field_dicts(vcf2, "INFO") + _assert_header_field_dicts_equivalent(info1, info2) - format1 = _get_headers(vcf1, "FORMAT") - format2 = _get_headers(vcf2, "FORMAT") - assert format1 == format2 + format1 = _get_header_field_dicts(vcf1, "FORMAT") + format2 = _get_header_field_dicts(vcf2, "FORMAT") + _assert_header_field_dicts_equivalent(format1, format2) - filter1 = _get_headers(vcf1, "FILTER") - filter2 = _get_headers(vcf2, "FILTER") - assert filter1 == filter2 + filter1 = _get_header_field_dicts(vcf1, "FILTER") + filter2 = _get_header_field_dicts(vcf2, "FILTER") + _assert_header_field_dicts_equivalent(filter1, filter2) - contig1 = _get_headers(vcf1, "CONTIG") - contig2 = _get_headers(vcf2, "CONTIG") - assert contig1 == contig2 + contig1 = _get_header_field_dicts(vcf1, "CONTIG") + contig2 = _get_header_field_dicts(vcf2, "CONTIG") + _assert_header_field_dicts_equivalent(contig1, contig2) def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=False): From 25fcb2543a474429399dcf977a11d851c97f776e Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 13 May 2025 11:53:12 +0100 Subject: [PATCH 7/7] Use any non-redundant meta information lines stored by bio2zarr --- tests/test_vcf_writer.py | 5 +++++ vcztools/vcf_writer.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py index 625eb66..1f32705 100644 --- a/tests/test_vcf_writer.py +++ b/tests/test_vcf_writer.py @@ -327,6 +327,11 @@ def test_write_vcf__generate_header(): ##contig= ##contig= ##contig= +##fileDate=20090805 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##ALT= +##ALT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 """ # noqa: E501 diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index 16b34d8..059ba09 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -414,6 +414,12 @@ def _generate_header( file=output, ) + # Other meta information lines not covered above + if "vcf_meta_information" in ds.attrs: + for key, value in ds.attrs["vcf_meta_information"]: + if key not in ("fileformat", "source"): + print(f"##{key}={value}", file=output) + # [1.5 Header line syntax] print( "#CHROM",