diff --git a/CHANGELOG.md b/CHANGELOG.md index a38a6299..a5e6df05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# 0.1.4 2025-03-XX + +- Fix bug in handling all-missing genotypes (#328) + # 0.1.3 2025-03-04 - Fix missing dependency issue for packaging diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py index 0f3f8566..bd124cb2 100644 --- a/bio2zarr/vcf2zarr/icf.py +++ b/bio2zarr/vcf2zarr/icf.py @@ -1095,9 +1095,8 @@ def process_partition(self, partition_index): for field in info_fields: tcw.append(field.full_name, variant.INFO.get(field.name, None)) if has_gt: - if variant.genotype is None: - val = None - else: + val = None + if "GT" in variant.FORMAT and variant.genotype is not None: val = variant.genotype.array() tcw.append("FORMAT/GT", val) for field in format_fields: diff --git a/tests/data/vcf/sample_all_missing_gts.vcf.gz b/tests/data/vcf/sample_all_missing_gts.vcf.gz new file mode 100644 index 00000000..d548a8e6 Binary files /dev/null and b/tests/data/vcf/sample_all_missing_gts.vcf.gz differ diff --git a/tests/data/vcf/sample_all_missing_gts.vcf.gz.csi b/tests/data/vcf/sample_all_missing_gts.vcf.gz.csi new file mode 100644 index 00000000..ba8c19db Binary files /dev/null and b/tests/data/vcf/sample_all_missing_gts.vcf.gz.csi differ diff --git a/tests/test_core.py b/tests/test_core.py index 62f76b88..2f4aa838 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -237,8 +237,8 @@ def test_examples(self, chunk_size, size, start, stop): # It works in CI on Linux, but it'll probably break at some point. # It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 4981734), - ("tests/data/vcf", 4969597), + ("tests/data", 4983044), + ("tests/data/vcf", 4970907), ("tests/data/vcf/sample.vcf.gz", 1089), ], ) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index f5609153..32170193 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -449,6 +449,29 @@ def test_region_index(self, ds): ) nt.assert_array_equal(ds["region_index"], region_index) + def test_small_example_all_missing_gts(self, ds, tmp_path_factory): + data_path = "tests/data/vcf/sample_all_missing_gts.vcf.gz" + out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" + vcf2zarr.convert([data_path], out, worker_processes=0) + ds2 = sg.load_dataset(out) + + assert_dataset_equal( + ds, + ds2, + drop_vars=["call_genotype", "call_genotype_mask", "call_genotype_phased"], + ) + gt1 = ds["call_genotype"].values + gt1[1] = -1 + nt.assert_array_equal(gt1, ds2["call_genotype"].values) + m1 = ds["call_genotype_mask"].values + m1[1] = True + nt.assert_array_equal(m1, ds2["call_genotype_mask"].values) + p1 = ds["call_genotype_phased"].values + # NOTE: Not sure this is the correct behaviour, but testing here anyway + # to keep a record that this is what we're doing + p1[1] = True + nt.assert_array_equal(p1, ds2["call_genotype_phased"].values) + class TestSmallExampleLocalAlleles: data_path = "tests/data/vcf/sample.vcf.gz"