diff --git a/CHANGELOG.md b/CHANGELOG.md index bb74fac7..f97f6e21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ # 0.1.2 2024-XX-XX +- Reduce memory requirement for encoding genotypes with large sample sizes + - Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300) - Add chunksize options to mkschema (issue:294) diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 566f6bc7..742e9f61 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -382,7 +382,7 @@ def fixed_field_spec( continue array_specs.append(spec_from_field(field)) - if gt_field is not None: + if gt_field is not None and n > 0: ploidy = max(gt_field.summary.max_number - 1, 1) shape = [m, n] chunks = [variants_chunk_size, samples_chunk_size] @@ -832,6 +832,7 @@ def encode_partition(self, partition_index): self.encode_array_partition(array_spec, partition_index) if self.has_genotypes(): self.encode_genotypes_partition(partition_index) + self.encode_genotype_mask_partition(partition_index) if self.has_local_alleles(): self.encode_local_alleles_partition(partition_index) self.encode_local_allele_fields_partition(partition_index) @@ -881,14 +882,10 @@ def encode_array_partition(self, array_spec, partition_index): self.finalise_partition_array(partition_index, ba) def encode_genotypes_partition(self, partition_index): - # FIXME we should be doing these one at a time, reading back in the genotypes - # like we do for local alleles + partition = self.metadata.partitions[partition_index] gt = self.init_partition_array(partition_index, "call_genotype") - gt_mask = self.init_partition_array(partition_index, "call_genotype_mask") gt_phased = self.init_partition_array(partition_index, "call_genotype_phased") - partition = self.metadata.partitions[partition_index] - source_field = self.icf.fields["FORMAT/GT"] for value in source_field.iter_values(partition.start, partition.stop): j = gt.next_buffer_row() @@ -899,13 +896,25 @@ def encode_genotypes_partition(self, partition_index): icf.sanitise_value_int_1d( gt_phased.buff, j, value[:, -1] if value is not None else None ) - # TODO check is this the correct semantics when we are padding - # with mixed ploidies? - j = gt_mask.next_buffer_row() - gt_mask.buff[j] = gt.buff[j] < 0 self.finalise_partition_array(partition_index, gt) self.finalise_partition_array(partition_index, gt_phased) + + def encode_genotype_mask_partition(self, partition_index): + partition = self.metadata.partitions[partition_index] + gt_mask = self.init_partition_array(partition_index, "call_genotype_mask") + # Read back in the genotypes so we can compute the mask + gt_array = zarr.open_array( + store=self.wip_partition_array_path(partition_index, "call_genotype"), + mode="r", + ) + for genotypes in core.first_dim_slice_iter( + gt_array, partition.start, partition.stop + ): + # TODO check is this the correct semantics when we are padding + # with mixed ploidies? + j = gt_mask.next_buffer_row() + gt_mask.buff[j] = genotypes < 0 self.finalise_partition_array(partition_index, gt_mask) def encode_local_alleles_partition(self, partition_index): diff --git a/tests/test_icf.py b/tests/test_icf.py index b59ae1dd..088be7ab 100644 --- a/tests/test_icf.py +++ b/tests/test_icf.py @@ -91,6 +91,19 @@ def test_INFO_NS(self, icf): assert icf["INFO/NS"].values == [None, None, 3, 3, 2, 3, 3, None, None] +class TestWithGtHeaderNoGenotypes: + data_path = "tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz" + + @pytest.fixture(scope="class") + def icf(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.exploded" + return vcf2zarr.explode(out, [self.data_path]) + + def test_gts(self, icf): + values = icf["FORMAT/GT"].values + assert values == [None] * icf.num_records + + class TestIcfWriterExample: data_path = "tests/data/vcf/sample.vcf.gz" diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index c1cb2ae8..3692fe44 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -508,6 +508,19 @@ def test_ok_without_local_alleles(self, ds): nt.assert_array_equal(ds.call_genotype.values, [[[0, 0, 0]]]) +class TestWithGtHeaderNoGenotypes: + data_path = "tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz" + + @pytest.fixture(scope="class") + def ds(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" + vcf2zarr.convert([self.data_path], out, worker_processes=0) + return sg.load_dataset(out) + + def test_gts(self, ds): + assert "call_genotype" not in ds + + class Test1000G2020Example: data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"