Skip to content

Commit 28fa419

Browse files
Handle some awkward corner cases (e.g. a VCF whose header declares FORMAT/GT but which contains no genotypes)
1 parent 9aba983 commit 28fa419

File tree

4 files changed

+33
-2
lines changed

4 files changed

+33
-2
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# 0.1.2 2024-XX-XX
22

3+
- Reduce memory requirement for encoding genotypes with large sample sizes
4+
35
- Transpose default chunk sizes to 1,000 variants and 10,000 samples (issue:300)
46

57
- Add chunksize options to mkschema (issue:294)

bio2zarr/vcf2zarr/vcz.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ def fixed_field_spec(
382382
continue
383383
array_specs.append(spec_from_field(field))
384384

385-
if gt_field is not None:
385+
if gt_field is not None and n > 0:
386386
ploidy = max(gt_field.summary.max_number - 1, 1)
387387
shape = [m, n]
388388
chunks = [variants_chunk_size, samples_chunk_size]
@@ -832,6 +832,7 @@ def encode_partition(self, partition_index):
832832
self.encode_array_partition(array_spec, partition_index)
833833
if self.has_genotypes():
834834
self.encode_genotypes_partition(partition_index)
835+
self.encode_genotype_mask_partition(partition_index)
835836
if self.has_local_alleles():
836837
self.encode_local_alleles_partition(partition_index)
837838
self.encode_local_allele_fields_partition(partition_index)
@@ -899,8 +900,10 @@ def encode_genotypes_partition(self, partition_index):
899900
self.finalise_partition_array(partition_index, gt)
900901
self.finalise_partition_array(partition_index, gt_phased)
901902

902-
# Read back in the genotypes so we can compute the mask
903+
def encode_genotype_mask_partition(self, partition_index):
904+
partition = self.metadata.partitions[partition_index]
903905
gt_mask = self.init_partition_array(partition_index, "call_genotype_mask")
906+
# Read back in the genotypes so we can compute the mask
904907
gt_array = zarr.open_array(
905908
store=self.wip_partition_array_path(partition_index, "call_genotype"),
906909
mode="r",

tests/test_icf.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,19 @@ def test_INFO_NS(self, icf):
9191
assert icf["INFO/NS"].values == [None, None, 3, 3, 2, 3, 3, None, None]
9292

9393

94+
class TestWithGtHeaderNoGenotypes:
95+
data_path = "tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz"
96+
97+
@pytest.fixture(scope="class")
98+
def icf(self, tmp_path_factory):
99+
out = tmp_path_factory.mktemp("data") / "example.exploded"
100+
return vcf2zarr.explode(out, [self.data_path])
101+
102+
def test_gts(self, icf):
103+
values = icf["FORMAT/GT"].values
104+
assert values == [None] * icf.num_records
105+
106+
94107
class TestIcfWriterExample:
95108
data_path = "tests/data/vcf/sample.vcf.gz"
96109

tests/test_vcf_examples.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,19 @@ def test_ok_without_local_alleles(self, ds):
508508
nt.assert_array_equal(ds.call_genotype.values, [[[0, 0, 0]]])
509509

510510

511+
class TestWithGtHeaderNoGenotypes:
512+
data_path = "tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz"
513+
514+
@pytest.fixture(scope="class")
515+
def ds(self, tmp_path_factory):
516+
out = tmp_path_factory.mktemp("data") / "example.vcf.zarr"
517+
vcf2zarr.convert([self.data_path], out, worker_processes=0)
518+
return sg.load_dataset(out)
519+
520+
def test_gts(self, ds):
521+
assert "call_genotype" not in ds
522+
523+
511524
class Test1000G2020Example:
512525
data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"
513526

0 commit comments

Comments
 (0)