Skip to content

Commit 21d0224

Browse files
benjefferyjeromekelleher
authored andcommitted
Add variant_length to output for plink and tskit, make missing arrays for index an error
1 parent 7942838 commit 21d0224

File tree

10 files changed

+138
-43
lines changed

10 files changed

+138
-43
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
- Add contigs to plink output (#344)
66

7+
- Add variant_length and indexing to plink output (#382)
8+
79
Breaking changes
810

911
- Remove explicit sample, contig and filter lists from the schema.

bio2zarr/plink.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
6363
gt[bed_chunk[i] == 2] = 1
6464
gt[bed_chunk[i] == 1, 0] = 1
6565

66-
yield alleles, (gt, phased)
66+
yield vcz.VariantData(max(len(a) for a in alleles), alleles, gt, phased)
6767

6868
def generate_schema(
6969
self,
@@ -112,6 +112,13 @@ def generate_schema(
112112
dimensions=["variants", "alleles"],
113113
description=None,
114114
),
115+
vcz.ZarrArraySpec(
116+
source=None,
117+
name="variant_length",
118+
dtype="i4",
119+
dimensions=["variants"],
120+
description="Length of each variant",
121+
),
115122
vcz.ZarrArraySpec(
116123
name="variant_contig",
117124
dtype=core.min_int_dtype(0, len(np.unique(self.bed.chromosome))),

bio2zarr/tskit.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,21 +97,23 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
9797
left=self.positions[start],
9898
right=self.positions[stop] if stop < self.num_records else None,
9999
samples=self.tskit_samples,
100+
copy=False,
100101
):
101102
gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
102103
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
104+
variant_length = 0
103105
for i, allele in enumerate(variant.alleles):
104106
# None is returned by tskit in the case of a missing allele
105107
if allele is None:
106108
continue
107109
assert i < num_alleles
108110
alleles[i] = allele
109-
111+
variant_length = max(variant_length, len(allele))
110112
gt[self.sample_indices, self.ploidy_indices] = variant.genotypes[
111113
self.genotype_indices
112114
]
113115

114-
yield alleles, (gt, phased)
116+
yield vcz.VariantData(variant_length, alleles, gt, phased)
115117

116118
def generate_schema(
117119
self,
@@ -164,6 +166,16 @@ def generate_schema(
164166
min_position = np.min(self.ts.sites_position)
165167
max_position = np.max(self.ts.sites_position)
166168

169+
tables = self.ts.tables
170+
ancestral_state_offsets = tables.sites.ancestral_state_offset
171+
derived_state_offsets = tables.mutations.derived_state_offset
172+
ancestral_lengths = ancestral_state_offsets[1:] - ancestral_state_offsets[:-1]
173+
derived_lengths = derived_state_offsets[1:] - derived_state_offsets[:-1]
174+
max_variant_length = max(
175+
np.max(ancestral_lengths) if len(ancestral_lengths) > 0 else 0,
176+
np.max(derived_lengths) if len(derived_lengths) > 0 else 0,
177+
)
178+
167179
array_specs = [
168180
vcz.ZarrArraySpec(
169181
source="position",
@@ -179,6 +191,13 @@ def generate_schema(
179191
dimensions=["variants", "alleles"],
180192
description="Alleles for each variant",
181193
),
194+
vcz.ZarrArraySpec(
195+
source=None,
196+
name="variant_length",
197+
dtype=core.min_int_dtype(0, max_variant_length),
198+
dimensions=["variants"],
199+
description="Length of each variant",
200+
),
182201
vcz.ZarrArraySpec(
183202
source=None,
184203
name="variant_contig",

bio2zarr/vcf.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1040,14 +1040,19 @@ def iter_genotypes(self, shape, start, stop):
10401040
yield sanitised_genotypes, sanitised_phased
10411041

10421042
def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
1043+
variant_lengths = self.fields["rlen"].iter_values(start, stop)
10431044
if self.gt_field is None or shape is None:
1044-
for alleles in self.iter_alleles(start, stop, num_alleles):
1045-
yield alleles, (None, None)
1045+
for variant_length, alleles in zip(
1046+
variant_lengths, self.iter_alleles(start, stop, num_alleles)
1047+
):
1048+
yield vcz.VariantData(variant_length, alleles, None, None)
10461049
else:
1047-
yield from zip(
1050+
for variant_length, alleles, (gt, phased) in zip(
1051+
variant_lengths,
10481052
self.iter_alleles(start, stop, num_alleles),
10491053
self.iter_genotypes(shape, start, stop),
1050-
)
1054+
):
1055+
yield vcz.VariantData(variant_length, alleles, gt, phased)
10511056

10521057
def generate_schema(
10531058
self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
@@ -1121,6 +1126,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11211126
compressor=compressor,
11221127
)
11231128

1129+
name_map = {field.full_name: field for field in self.metadata.fields}
11241130
array_specs = [
11251131
fixed_field_spec(
11261132
name="variant_contig",
@@ -1136,6 +1142,11 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11361142
dtype="O",
11371143
dimensions=["variants", "alleles"],
11381144
),
1145+
fixed_field_spec(
1146+
name="variant_length",
1147+
dtype=name_map["rlen"].smallest_dtype(),
1148+
dimensions=["variants"],
1149+
),
11391150
fixed_field_spec(
11401151
name="variant_id",
11411152
dtype="O",
@@ -1145,14 +1156,12 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11451156
dtype="bool",
11461157
),
11471158
]
1148-
name_map = {field.full_name: field for field in self.metadata.fields}
11491159

1150-
# Only three of the fixed fields have a direct one-to-one mapping.
1160+
# Only two of the fixed fields have a direct one-to-one mapping.
11511161
array_specs.extend(
11521162
[
11531163
spec_from_field(name_map["QUAL"], array_name="variant_quality"),
11541164
spec_from_field(name_map["POS"], array_name="variant_position"),
1155-
spec_from_field(name_map["rlen"], array_name="variant_length"),
11561165
]
11571166
)
11581167
array_specs.extend(

bio2zarr/vcz.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,16 @@
3737
}
3838

3939

40+
@dataclasses.dataclass
41+
class VariantData:
42+
"""Represents variant data returned by iter_alleles_and_genotypes."""
43+
44+
variant_length: int
45+
alleles: np.ndarray
46+
genotypes: np.ndarray
47+
phased: np.ndarray
48+
49+
4050
class Source(abc.ABC):
4151
@property
4252
@abc.abstractmethod
@@ -794,6 +804,7 @@ def encode_array_partition(self, array_spec, partition_index):
794804
def encode_alleles_and_genotypes_partition(self, partition_index):
795805
partition = self.metadata.partitions[partition_index]
796806
alleles = self.init_partition_array(partition_index, "variant_allele")
807+
variant_lengths = self.init_partition_array(partition_index, "variant_length")
797808
has_gt = self.has_genotypes()
798809
shape = None
799810
if has_gt:
@@ -802,18 +813,21 @@ def encode_alleles_and_genotypes_partition(self, partition_index):
802813
partition_index, "call_genotype_phased"
803814
)
804815
shape = gt.buff.shape[1:]
805-
for alleles_value, (genotype, phased) in self.source.iter_alleles_and_genotypes(
816+
for variant_data in self.source.iter_alleles_and_genotypes(
806817
partition.start, partition.stop, shape, alleles.array.shape[1]
807818
):
808819
j_alleles = alleles.next_buffer_row()
809-
alleles.buff[j_alleles] = alleles_value
820+
alleles.buff[j_alleles] = variant_data.alleles
821+
j_variant_length = variant_lengths.next_buffer_row()
822+
variant_lengths.buff[j_variant_length] = variant_data.variant_length
810823
if has_gt:
811824
j = gt.next_buffer_row()
812-
gt.buff[j] = genotype
825+
gt.buff[j] = variant_data.genotypes
813826
j_phased = gt_phased.next_buffer_row()
814-
gt_phased.buff[j_phased] = phased
827+
gt_phased.buff[j_phased] = variant_data.phased
815828

816829
self.finalise_partition_array(partition_index, alleles)
830+
self.finalise_partition_array(partition_index, variant_lengths)
817831
if has_gt:
818832
self.finalise_partition_array(partition_index, gt)
819833
self.finalise_partition_array(partition_index, gt_phased)
@@ -1103,14 +1117,16 @@ def __init__(self, path):
11031117
def create_index(self):
11041118
"""Create an index to support efficient region queries."""
11051119
root = zarr.open_group(store=self.path, mode="r+")
1106-
1120+
print(list(root.keys()))
11071121
if (
11081122
"variant_contig" not in root
11091123
or "variant_position" not in root
11101124
or "variant_length" not in root
11111125
):
1112-
logger.warning("Cannot create index: required arrays not found")
1113-
return
1126+
raise ValueError(
1127+
"Cannot create index: variant_contig, "
1128+
"variant_position and variant_length arrays are required"
1129+
)
11141130

11151131
contig = root["variant_contig"]
11161132
pos = root["variant_position"]

tests/data/plink/example.bim

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
1 1_10 0 10 A G
2-
1 1_20 0 20 T C
1+
1 1_10 0 10 A GG
2+
1 1_20 0 20 TTT C

tests/test_core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ def test_examples(self, chunk_size, size, start, stop):
237237
# It works in CI on Linux, but it'll probably break at some point.
238238
# It's also necessary to update these numbers each time a new data
239239
# file gets added
240-
("tests/data", 5045029),
240+
("tests/data", 5045032),
241241
("tests/data/vcf", 5018640),
242242
("tests/data/vcf/sample.vcf.gz", 1089),
243243
],

tests/test_plink.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,8 @@ class TestExample:
9494
"""
9595
.bim file looks like this:
9696
97-
1 1_10 0 10 A G
98-
1 1_20 0 20 T C
97+
1 1_10 0 10 A GG
98+
1 1_20 0 20 TTT C
9999
100100
Definition: https://www.cog-genomics.org/plink/1.9/formats#bim
101101
Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0'
@@ -121,7 +121,10 @@ def test_variant_position(self, ds):
121121
nt.assert_array_equal(ds.variant_position, [10, 20])
122122

123123
def test_variant_allele(self, ds):
124-
nt.assert_array_equal(ds.variant_allele, [["A", "G"], ["T", "C"]])
124+
nt.assert_array_equal(ds.variant_allele, [["A", "GG"], ["TTT", "C"]])
125+
126+
def test_variant_length(self, ds):
127+
nt.assert_array_equal(ds.variant_length, [2, 3])
125128

126129
def test_contig_id(self, ds):
127130
"""Test that contig identifiers are correctly extracted and stored."""
@@ -266,6 +269,9 @@ def test_chunk_size(
266269
worker_processes=worker_processes,
267270
)
268271
ds2 = sg.load_dataset(out)
272+
# Drop the region_index as it is chunk dependent
273+
ds = ds.drop_vars("region_index")
274+
ds2 = ds2.drop_vars("region_index")
269275
xt.assert_equal(ds, ds2)
270276
# TODO check array chunks
271277

@@ -372,3 +378,9 @@ def test_genotypes(self, ds):
372378

373379
def test_variant_position(self, ds):
374380
nt.assert_array_equal(ds.variant_position, [10, 20, 10, 10, 20, 10])
381+
382+
def test_variant_length(self, ds):
383+
nt.assert_array_equal(
384+
ds.variant_length,
385+
[1, 1, 1, 1, 1, 1],
386+
)

0 commit comments

Comments
 (0)