Skip to content

Commit 9aba983

Browse files
Reduce encoding memory by computing mask separately
1 parent 9f817a2 commit 9aba983

File tree

1 file changed

+15
-9
lines changed

1 file changed

+15
-9
lines changed

bio2zarr/vcf2zarr/vcz.py

Lines changed: 15 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -881,14 +881,10 @@ def encode_array_partition(self, array_spec, partition_index):
881881
self.finalise_partition_array(partition_index, ba)
882882

883883
def encode_genotypes_partition(self, partition_index):
884-
# FIXME we should be doing these one at a time, reading back in the genotypes
885-
# like we do for local alleles
884+
partition = self.metadata.partitions[partition_index]
886885
gt = self.init_partition_array(partition_index, "call_genotype")
887-
gt_mask = self.init_partition_array(partition_index, "call_genotype_mask")
888886
gt_phased = self.init_partition_array(partition_index, "call_genotype_phased")
889887

890-
partition = self.metadata.partitions[partition_index]
891-
892888
source_field = self.icf.fields["FORMAT/GT"]
893889
for value in source_field.iter_values(partition.start, partition.stop):
894890
j = gt.next_buffer_row()
@@ -899,13 +895,23 @@ def encode_genotypes_partition(self, partition_index):
899895
icf.sanitise_value_int_1d(
900896
gt_phased.buff, j, value[:, -1] if value is not None else None
901897
)
902-
# TODO check is this the correct semantics when we are padding
903-
# with mixed ploidies?
904-
j = gt_mask.next_buffer_row()
905-
gt_mask.buff[j] = gt.buff[j] < 0
906898

907899
self.finalise_partition_array(partition_index, gt)
908900
self.finalise_partition_array(partition_index, gt_phased)
901+
902+
# Read back in the genotypes so we can compute the mask
903+
gt_mask = self.init_partition_array(partition_index, "call_genotype_mask")
904+
gt_array = zarr.open_array(
905+
store=self.wip_partition_array_path(partition_index, "call_genotype"),
906+
mode="r",
907+
)
908+
for genotypes in core.first_dim_slice_iter(
909+
gt_array, partition.start, partition.stop
910+
):
911+
# TODO check is this the correct semantics when we are padding
912+
# with mixed ploidies?
913+
j = gt_mask.next_buffer_row()
914+
gt_mask.buff[j] = genotypes < 0
909915
self.finalise_partition_array(partition_index, gt_mask)
910916

911917
def encode_local_alleles_partition(self, partition_index):

0 commit comments

Comments
 (0)