Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions bio2zarr/icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ def full_name(self):
return self.name
return f"{self.category}/{self.name}"

@property
def max_number(self):
if self.vcf_number in ("R", "A", "G", "."):
return self.summary.max_number
else:
# use declared number if larger than max found
return max(self.summary.max_number, int(self.vcf_number))

def smallest_dtype(self):
"""
Returns the smallest dtype suitable for this field based
Expand Down
10 changes: 7 additions & 3 deletions bio2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,14 @@ def from_field(
dimensions.append("samples")
if array_name is None:
array_name = prefix + vcf_field.name

# TODO make an option to add in the empty extra dimension
if vcf_field.summary.max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
shape.append(vcf_field.summary.max_number)
chunks.append(vcf_field.summary.max_number)
max_number = vcf_field.max_number
if (max_number > 0 and vcf_field.vcf_number in ("R", "A", "G")) or (
max_number > 1 or vcf_field.full_name == "FORMAT/LAA"
):
shape.append(max_number)
chunks.append(max_number)
# TODO we should really be checking this to see if the named dimensions
# are actually correct.
if vcf_field.vcf_number == "R":
Expand Down
Binary file added tests/data/vcf/chr22.vcf.gz
Binary file not shown.
Binary file added tests/data/vcf/chr22.vcf.gz.csi
Binary file not shown.
4 changes: 2 additions & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,8 @@ def test_examples(self, chunk_size, size, start, stop):
# It works in CI on Linux, but it'll probably break at some point.
# It's also necessary to update these numbers each time a new data
# file gets added
("tests/data", 4983044),
("tests/data/vcf", 4970907),
("tests/data", 5030777),
("tests/data/vcf", 5018640),
("tests/data/vcf/sample.vcf.gz", 1089),
],
)
Expand Down
20 changes: 20 additions & 0 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,21 @@ def test_gts(self, ds):
assert "call_genotype" not in ds


class TestChr22Example:
data_path = "tests/data/vcf/chr22.vcf.gz"

@pytest.fixture(scope="class")
def ds(self, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.vcf.zarr"
icf.convert([self.data_path], out, worker_processes=0)
return sg.load_dataset(out)

def test_call_SB(self, ds):
# fixes https://github.com/sgkit-dev/bio2zarr/issues/355
assert ds.call_SB.dims == ("variants", "samples", "FORMAT_SB_dim")
assert ds.call_SB.shape == (100, 100, 4)


class Test1000G2020Example:
data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"

Expand Down Expand Up @@ -997,6 +1012,11 @@ def test_variant_ANN(self, ds):
]
nt.assert_array_equal(ds.variant_ANN.values, variant_ANN)

def test_variant_MLEAF(self, ds):
# fixes https://github.com/sgkit-dev/bio2zarr/issues/353
assert ds.variant_MLEAF.dims == ("variants", "alt_alleles")
assert ds.variant_MLEAF.shape == (21, 1)


class TestGeneratedFieldsExample:
data_path = "tests/data/vcf/field_type_combos.vcf.gz"
Expand Down