Skip to content

Commit 33d1d31

Browse files
committed
Iterate alleles and genotypes at the same time
1 parent bd90108 commit 33d1d31

File tree

3 files changed

+70
-71
lines changed

3 files changed

+70
-71
lines changed

bio2zarr/icf.py

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -970,19 +970,6 @@ def root_attrs(self):
970970
"vcf_header": self.vcf_header,
971971
}
972972

973-
def iter_alleles(self, start, stop, num_alleles):
974-
ref_field = self.fields["REF"]
975-
alt_field = self.fields["ALT"]
976-
977-
for ref, alt in zip(
978-
ref_field.iter_values(start, stop),
979-
alt_field.iter_values(start, stop),
980-
):
981-
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
982-
alleles[0] = ref[0]
983-
alleles[1 : 1 + len(alt)] = alt
984-
yield alleles
985-
986973
def iter_id(self, start, stop):
987974
for value in self.fields["ID"].iter_values(start, stop):
988975
if value is not None:
@@ -1025,14 +1012,32 @@ def iter_field(self, field_name, shape, start, stop):
10251012
for value in source_field.iter_values(start, stop):
10261013
yield sanitiser(value)
10271014

1028-
def iter_genotypes(self, shape, start, stop):
1029-
source_field = self.fields["FORMAT/GT"]
1030-
for value in source_field.iter_values(start, stop):
1031-
genotypes = value[:, :-1] if value is not None else None
1032-
phased = value[:, -1] if value is not None else None
1033-
sanitised_genotypes = sanitise_value_int_2d(shape, genotypes)
1034-
sanitised_phased = sanitise_value_int_1d(shape[:-1], phased)
1035-
yield sanitised_genotypes, sanitised_phased
1015+
def iter_alleles_and_genotypes(
1016+
self, start, stop, include_genotypes, shape, num_alleles
1017+
):
1018+
ref_field = self.fields["REF"]
1019+
alt_field = self.fields["ALT"]
1020+
# Create a dummy gt iterator if genotypes are not included
1021+
1022+
for ref, alt, gt in zip(
1023+
ref_field.iter_values(start, stop),
1024+
alt_field.iter_values(start, stop),
1025+
(None for _ in range(stop - start))
1026+
if not include_genotypes
1027+
else self.fields["FORMAT/GT"].iter_values(start, stop),
1028+
):
1029+
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
1030+
alleles[0] = ref[0]
1031+
alleles[1 : 1 + len(alt)] = alt
1032+
1033+
if include_genotypes:
1034+
genotypes = gt[:, :-1] if gt is not None else None
1035+
phased = gt[:, -1] if gt is not None else None
1036+
sanitised_genotypes = sanitise_value_int_2d(shape, genotypes)
1037+
sanitised_phased = sanitise_value_int_1d(shape[:-1], phased)
1038+
yield alleles, sanitised_genotypes, sanitised_phased
1039+
else:
1040+
yield alleles, None, None
10361041

10371042
def generate_schema(
10381043
self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None

bio2zarr/plink.py

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,34 +31,32 @@ def samples(self):
3131
def num_samples(self):
3232
return len(self.samples)
3333

34-
def iter_alleles(self, start, stop, num_alleles):
35-
ref_field = self.bed.allele_1
36-
alt_field = self.bed.allele_2
37-
38-
for ref, alt in zip(
39-
ref_field[start:stop],
40-
alt_field[start:stop],
41-
):
42-
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
43-
alleles[0] = ref
44-
alleles[1 : 1 + len(alt)] = alt
45-
yield alleles
46-
4734
def iter_field(self, field_name, shape, start, stop):
4835
assert field_name == "position" # Only position field is supported from plink
4936
yield from self.bed.bp_position[start:stop]
5037

51-
def iter_genotypes(self, shape, start, stop):
38+
def iter_alleles_and_genotypes(
39+
self, start, stop, include_genotypes, shape, num_alleles
40+
):
41+
if not include_genotypes:
42+
raise NotImplementedError("include_genotypes must be True for plink format")
43+
ref_field = self.bed.allele_1
44+
alt_field = self.bed.allele_2
5245
bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T
5346
gt = np.zeros(shape, dtype=np.int8)
5447
phased = np.zeros(shape[:-1], dtype=bool)
55-
for values in bed_chunk:
48+
for i, (ref, alt) in enumerate(
49+
zip(ref_field[start:stop], alt_field[start:stop])
50+
):
51+
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
52+
alleles[0] = ref
53+
alleles[1 : 1 + len(alt)] = alt
5654
gt[:] = 0
57-
gt[values == -127] = -1
58-
gt[values == 2] = 1
59-
gt[values == 1, 0] = 1
55+
gt[bed_chunk[i] == -127] = -1
56+
gt[bed_chunk[i] == 2] = 1
57+
gt[bed_chunk[i] == 1, 0] = 1
6058

61-
yield gt, phased
59+
yield alleles, gt, phased
6260

6361
def generate_schema(
6462
self,

bio2zarr/vcz.py

Lines changed: 27 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,9 @@ def root_attrs(self):
6969
return {}
7070

7171
@abc.abstractmethod
72-
def iter_alleles(self, start, stop, num_alleles):
73-
pass
74-
75-
@abc.abstractmethod
76-
def iter_genotypes(self, start, stop, num_alleles):
72+
def iter_alleles_and_genotypes(
73+
self, start, stop, include_genotypes, shape, num_alleles
74+
):
7775
pass
7876

7977
def iter_id(self, start, stop):
@@ -718,12 +716,11 @@ def encode_partition(self, partition_index):
718716
self.encode_filters_partition(partition_index)
719717
if "variant_contig" in all_field_names:
720718
self.encode_contig_partition(partition_index)
721-
self.encode_alleles_partition(partition_index)
719+
self.encode_alleles_and_genotypes_partition(partition_index)
722720
for array_spec in self.schema.fields:
723721
if array_spec.source is not None:
724722
self.encode_array_partition(array_spec, partition_index)
725723
if self.has_genotypes():
726-
self.encode_genotypes_partition(partition_index)
727724
self.encode_genotype_mask_partition(partition_index)
728725
if self.has_local_alleles():
729726
self.encode_local_alleles_partition(partition_index)
@@ -774,22 +771,33 @@ def encode_array_partition(self, array_spec, partition_index):
774771

775772
self.finalise_partition_array(partition_index, ba)
776773

777-
def encode_genotypes_partition(self, partition_index):
774+
def encode_alleles_and_genotypes_partition(self, partition_index):
778775
partition = self.metadata.partitions[partition_index]
779-
gt = self.init_partition_array(partition_index, "call_genotype")
780-
gt_phased = self.init_partition_array(partition_index, "call_genotype_phased")
776+
alleles = self.init_partition_array(partition_index, "variant_allele")
777+
has_gt = self.has_genotypes()
778+
shape = None
779+
if has_gt:
780+
gt = self.init_partition_array(partition_index, "call_genotype")
781+
gt_phased = self.init_partition_array(
782+
partition_index, "call_genotype_phased"
783+
)
784+
shape = gt.buff.shape[1:]
781785

782-
for genotype, phased in self.source.iter_genotypes(
783-
gt.buff.shape[1:], partition.start, partition.stop
786+
for alleles_value, genotype, phased in self.source.iter_alleles_and_genotypes(
787+
partition.start, partition.stop, has_gt, shape, alleles.array.shape[1]
784788
):
785-
j = gt.next_buffer_row()
786-
gt.buff[j] = genotype
789+
j_alleles = alleles.next_buffer_row()
790+
alleles.buff[j_alleles] = alleles_value
791+
if has_gt:
792+
j = gt.next_buffer_row()
793+
gt.buff[j] = genotype
794+
j_phased = gt_phased.next_buffer_row()
795+
gt_phased.buff[j_phased] = phased
787796

788-
j_phased = gt_phased.next_buffer_row()
789-
gt_phased.buff[j_phased] = phased
790-
791-
self.finalise_partition_array(partition_index, gt)
792-
self.finalise_partition_array(partition_index, gt_phased)
797+
self.finalise_partition_array(partition_index, alleles)
798+
if has_gt:
799+
self.finalise_partition_array(partition_index, gt)
800+
self.finalise_partition_array(partition_index, gt_phased)
793801

794802
def encode_genotype_mask_partition(self, partition_index):
795803
partition = self.metadata.partitions[partition_index]
@@ -851,18 +859,6 @@ def encode_local_allele_fields_partition(self, partition_index):
851859
buff.buff[j] = descriptor.convert(value, la)
852860
self.finalise_partition_array(partition_index, buff)
853861

854-
def encode_alleles_partition(self, partition_index):
855-
alleles = self.init_partition_array(partition_index, "variant_allele")
856-
partition = self.metadata.partitions[partition_index]
857-
858-
for value in self.source.iter_alleles(
859-
partition.start, partition.stop, alleles.array.shape[1]
860-
):
861-
j = alleles.next_buffer_row()
862-
alleles.buff[j] = value
863-
864-
self.finalise_partition_array(partition_index, alleles)
865-
866862
def encode_id_partition(self, partition_index):
867863
vid = self.init_partition_array(partition_index, "variant_id")
868864
vid_mask = self.init_partition_array(partition_index, "variant_id_mask")

0 commit comments

Comments
 (0)