Skip to content

Commit b22ff25

Browse files
Detect VCF fields that would clobber VCF Zarr
Closes #32
1 parent 6071cca commit b22ff25

File tree

2 files changed

+83
-0
lines changed

2 files changed

+83
-0
lines changed

bio2zarr/vcf.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -962,6 +962,26 @@ def check_overlapping_partitions(partitions):
962962
)
963963

964964

965+
def check_field_clobbering(icf_metadata):
    """
    Reject VCF field names that clash with fixed VCF Zarr field names.

    INFO field names are compared against the fixed variant-level names and
    FORMAT field names against the fixed genotype names. The INFO check runs
    first, so when both kinds clash only the INFO clash is reported.

    :param icf_metadata: Object exposing ``info_fields`` and ``format_fields``
        iterables whose items each have a ``name`` attribute.
    :raises ValueError: If any INFO or FORMAT field name is one of the
        reserved names listed below.
    """
    reserved_variant_names = {
        "contig",
        "id",
        "id_mask",
        "position",
        "allele",
        "filter",
        "quality",
    }
    info_field_names = {field.name for field in icf_metadata.info_fields}
    intersection = info_field_names & reserved_variant_names
    if intersection:
        raise ValueError(
            f"INFO field name(s) clashing with VCF Zarr spec: {intersection}"
        )

    # FORMAT fields are checked against a separate set of reserved names, so
    # an INFO field called e.g. "genotype" is not rejected by this function.
    reserved_genotype_names = {"genotype", "genotype_phased", "genotype_mask"}
    format_field_names = {field.name for field in icf_metadata.format_fields}
    intersection = format_field_names & reserved_genotype_names
    if intersection:
        raise ValueError(
            f"FORMAT field name(s) clashing with VCF Zarr spec: {intersection}"
        )
983+
984+
965985
class IntermediateColumnarFormatWriter:
966986
def __init__(self, path):
967987
self.path = pathlib.Path(path)
@@ -996,6 +1016,7 @@ def init(
9961016
show_progress=show_progress,
9971017
target_num_partitions=target_num_partitions,
9981018
)
1019+
check_field_clobbering(icf_metadata)
9991020
self.metadata = icf_metadata
10001021
self.metadata.format_version = ICF_METADATA_FORMAT_VERSION
10011022
self.metadata.compressor = compressor.get_config()

tests/test_vcf.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22

3+
import pysam
34
import pytest
45
import sgkit as sg
56
import xarray.testing as xt
@@ -504,3 +505,64 @@ def test_encode_partition_out_of_range(self, icf_path, tmp_path, partition):
504505
vcf.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3)
505506
with pytest.raises(ValueError, match="Partition index not in the valid range"):
506507
vcf.encode_partition(zarr_path, partition)
508+
509+
510+
class TestClobberFixedFields:
    """
    Check that ``vcf.explode`` rejects VCFs declaring INFO or FORMAT fields
    whose names clash with fixed VCF Zarr field names.
    """

    def generate_vcf(self, path, info_field=None, format_field=None, num_rows=1):
        """
        Write a minimal VCF to ``path``, then compress and tabix-index it.

        :param path: Destination of the uncompressed VCF text.
        :param info_field: If given, an ``##INFO`` header line with this ID
            is written.
        :param format_field: If given, a ``##FORMAT`` header line with this
            ID is written. No FORMAT or sample columns are written, so the
            field is only declared in the header.
        :param num_rows: Number of data rows, written at positions
            ``1..num_rows`` on contig ``1``.
        """
        with open(path, "w") as out:
            print("##fileformat=VCFv4.2", file=out)
            print('##FILTER=<ID=PASS,Description="All filters passed">', file=out)
            print("##contig=<ID=1>", file=out)
            if info_field is not None:
                print(
                    f'##INFO=<ID={info_field},Number=1,Type=Float,Description="">',
                    file=out,
                )
            if format_field is not None:
                print(
                    f'##FORMAT=<ID={format_field},Number=1,Type=Float,Description="">',
                    file=out,
                )
            header = "\t".join(
                ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
            )
            print(header, file=out)
            for k in range(num_rows):
                pos = str(k + 1)
                # One value per header column: missing ID, then REF=A, ALT=T.
                row = ["1", pos, ".", "A", "T", ".", ".", "."]
                print("\t".join(row), file=out)

        # Echo the generated file to stdout as a debugging aid; the context
        # manager makes sure the read handle is closed again.
        with open(path) as vcf_text:
            print(vcf_text.read())
        # This also compresses the input file
        pysam.tabix_index(str(path), preset="vcf")

    @pytest.mark.parametrize(
        "field",
        [
            "contig",
            "id",
            "id_mask",
            "position",
            "allele",
            "filter",
            "quality",
        ],
    )
    def test_variant_fields(self, tmp_path, field):
        """An INFO field named like a fixed variant field is rejected."""
        vcf_file = tmp_path / "test.vcf"
        self.generate_vcf(vcf_file, info_field=field)
        # generate_vcf leaves a compressed copy at test.vcf.gz, which is the
        # file handed to explode.
        with pytest.raises(ValueError, match=f"INFO field name.*{field}"):
            vcf.explode(tmp_path / "x.icf", [tmp_path / "test.vcf.gz"])

    @pytest.mark.parametrize(
        "field",
        [
            "genotype",
            "genotype_phased",
            "genotype_mask",
        ],
    )
    def test_call_fields(self, tmp_path, field):
        """A FORMAT field named like a fixed genotype field is rejected."""
        vcf_file = tmp_path / "test.vcf"
        self.generate_vcf(vcf_file, format_field=field)
        # generate_vcf leaves a compressed copy at test.vcf.gz, which is the
        # file handed to explode.
        with pytest.raises(ValueError, match=f"FORMAT field name.*{field}"):
            vcf.explode(tmp_path / "x.icf", [tmp_path / "test.vcf.gz"])

0 commit comments

Comments
 (0)