Skip to content

Commit 96dfb4a

Browse files
tomwhite authored and jeromekelleher committed
Preserve VCF fields defined in header that don't appear otherwise
1 parent 74f5dc6 commit 96dfb4a

File tree

5 files changed

+30
-5
lines changed

5 files changed

+30
-5
lines changed

bio2zarr/icf.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ def full_name(self):
7979
return self.name
8080
return f"{self.category}/{self.name}"
8181

82+
@property
83+
def max_number(self):
84+
if self.vcf_number in ("R", "A", "G", "."):
85+
return self.summary.max_number
86+
else:
87+
# use declared number if larger than max found
88+
return max(self.summary.max_number, int(self.vcf_number))
89+
8290
def smallest_dtype(self):
8391
"""
8492
Returns the smallest dtype suitable for this field based

bio2zarr/vcz.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -136,12 +136,14 @@ def from_field(
136136
dimensions.append("samples")
137137
if array_name is None:
138138
array_name = prefix + vcf_field.name
139+
139140
# TODO make an option to add in the empty extra dimension
140-
if (
141-
vcf_field.summary.max_number > 0 and vcf_field.vcf_number in ("R", "A", "G")
142-
) or (vcf_field.summary.max_number > 1 or vcf_field.full_name == "FORMAT/LAA"):
143-
shape.append(vcf_field.summary.max_number)
144-
chunks.append(vcf_field.summary.max_number)
141+
max_number = vcf_field.max_number  # field-level value: also honours the Number declared in the header, not only the observed maximum
142+
if (max_number > 0 and vcf_field.vcf_number in ("R", "A", "G")) or (
143+
max_number > 1 or vcf_field.full_name == "FORMAT/LAA"
144+
):
145+
shape.append(max_number)
146+
chunks.append(max_number)
145147
# TODO we should really be checking this to see if the named dimensions
146148
# are actually correct.
147149
if vcf_field.vcf_number == "R":

tests/data/vcf/chr22.vcf.gz

46.5 KB
Binary file not shown.

tests/data/vcf/chr22.vcf.gz.csi

116 Bytes
Binary file not shown.

tests/test_vcf_examples.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,21 @@ def test_gts(self, ds):
563563
assert "call_genotype" not in ds
564564

565565

566+
class TestChr22Example:
567+
data_path = "tests/data/vcf/chr22.vcf.gz"
568+
569+
@pytest.fixture(scope="class")
570+
def ds(self, tmp_path_factory):
571+
out = tmp_path_factory.mktemp("data") / "example.vcf.zarr"
572+
icf.convert([self.data_path], out, worker_processes=0)
573+
return sg.load_dataset(out)
574+
575+
def test_call_SB(self, ds):
576+
# fixes https://github.com/sgkit-dev/bio2zarr/issues/355
577+
assert ds.call_SB.dims == ("variants", "samples", "FORMAT_SB_dim")
578+
assert ds.call_SB.shape == (100, 100, 4)
579+
580+
566581
class Test1000G2020Example:
567582
data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"
568583

0 commit comments

Comments
 (0)