Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# 0.1.2 2024-XX-XX

- Reduce memory requirement for encoding genotypes with large sample sizes

- Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300)

- Add chunksize options to mkschema (issue:294)
Expand Down
29 changes: 19 additions & 10 deletions bio2zarr/vcf2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ def fixed_field_spec(
continue
array_specs.append(spec_from_field(field))

if gt_field is not None:
if gt_field is not None and n > 0:
ploidy = max(gt_field.summary.max_number - 1, 1)
shape = [m, n]
chunks = [variants_chunk_size, samples_chunk_size]
Expand Down Expand Up @@ -832,6 +832,7 @@ def encode_partition(self, partition_index):
self.encode_array_partition(array_spec, partition_index)
if self.has_genotypes():
self.encode_genotypes_partition(partition_index)
self.encode_genotype_mask_partition(partition_index)
if self.has_local_alleles():
self.encode_local_alleles_partition(partition_index)
self.encode_local_allele_fields_partition(partition_index)
Expand Down Expand Up @@ -881,14 +882,10 @@ def encode_array_partition(self, array_spec, partition_index):
self.finalise_partition_array(partition_index, ba)

def encode_genotypes_partition(self, partition_index):
# FIXME we should be doing these one at a time, reading back in the genotypes
# like we do for local alleles
partition = self.metadata.partitions[partition_index]
gt = self.init_partition_array(partition_index, "call_genotype")
gt_mask = self.init_partition_array(partition_index, "call_genotype_mask")
gt_phased = self.init_partition_array(partition_index, "call_genotype_phased")

partition = self.metadata.partitions[partition_index]

source_field = self.icf.fields["FORMAT/GT"]
for value in source_field.iter_values(partition.start, partition.stop):
j = gt.next_buffer_row()
Expand All @@ -899,13 +896,25 @@ def encode_genotypes_partition(self, partition_index):
icf.sanitise_value_int_1d(
gt_phased.buff, j, value[:, -1] if value is not None else None
)
# TODO check is this the correct semantics when we are padding
# with mixed ploidies?
j = gt_mask.next_buffer_row()
gt_mask.buff[j] = gt.buff[j] < 0

self.finalise_partition_array(partition_index, gt)
self.finalise_partition_array(partition_index, gt_phased)

def encode_genotype_mask_partition(self, partition_index):
partition = self.metadata.partitions[partition_index]
gt_mask = self.init_partition_array(partition_index, "call_genotype_mask")
# Read back in the genotypes so we can compute the mask
gt_array = zarr.open_array(
store=self.wip_partition_array_path(partition_index, "call_genotype"),
mode="r",
)
for genotypes in core.first_dim_slice_iter(
gt_array, partition.start, partition.stop
):
# TODO check is this the correct semantics when we are padding
# with mixed ploidies?
j = gt_mask.next_buffer_row()
gt_mask.buff[j] = genotypes < 0
self.finalise_partition_array(partition_index, gt_mask)

def encode_local_alleles_partition(self, partition_index):
Expand Down
13 changes: 13 additions & 0 deletions tests/test_icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,19 @@ def test_INFO_NS(self, icf):
assert icf["INFO/NS"].values == [None, None, 3, 3, 2, 3, 3, None, None]


class TestWithGtHeaderNoGenotypes:
data_path = "tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz"

@pytest.fixture(scope="class")
def icf(self, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.exploded"
return vcf2zarr.explode(out, [self.data_path])

def test_gts(self, icf):
values = icf["FORMAT/GT"].values
assert values == [None] * icf.num_records


class TestIcfWriterExample:
data_path = "tests/data/vcf/sample.vcf.gz"

Expand Down
13 changes: 13 additions & 0 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,19 @@ def test_ok_without_local_alleles(self, ds):
nt.assert_array_equal(ds.call_genotype.values, [[[0, 0, 0]]])


class TestWithGtHeaderNoGenotypes:
data_path = "tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz"

@pytest.fixture(scope="class")
def ds(self, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.vcf.zarr"
vcf2zarr.convert([self.data_path], out, worker_processes=0)
return sg.load_dataset(out)

def test_gts(self, ds):
assert "call_genotype" not in ds


class Test1000G2020Example:
data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"

Expand Down
Loading