Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

- Add contigs to plink output (#344)

- Add variant_length and indexing to plink output (#382)

Breaking changes

- Remove explicit sample, contig and filter lists from the schema.
Expand Down
9 changes: 8 additions & 1 deletion bio2zarr/plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
gt[bed_chunk[i] == 2] = 1
gt[bed_chunk[i] == 1, 0] = 1

yield alleles, (gt, phased)
yield vcz.VariantData(max(len(a) for a in alleles), alleles, gt, phased)

def generate_schema(
self,
Expand Down Expand Up @@ -112,6 +112,13 @@ def generate_schema(
dimensions=["variants", "alleles"],
description=None,
),
vcz.ZarrArraySpec(
source=None,
name="variant_length",
dtype="i4",
dimensions=["variants"],
description="Length of each variant",
),
vcz.ZarrArraySpec(
name="variant_contig",
dtype=core.min_int_dtype(0, len(np.unique(self.bed.chromosome))),
Expand Down
23 changes: 21 additions & 2 deletions bio2zarr/tskit.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,21 +97,23 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
left=self.positions[start],
right=self.positions[stop] if stop < self.num_records else None,
samples=self.tskit_samples,
copy=False,
):
gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
variant_length = 0
for i, allele in enumerate(variant.alleles):
# None is returned by tskit in the case of a missing allele
if allele is None:
continue
assert i < num_alleles
alleles[i] = allele

variant_length = max(variant_length, len(allele))
gt[self.sample_indices, self.ploidy_indices] = variant.genotypes[
self.genotype_indices
]

yield alleles, (gt, phased)
yield vcz.VariantData(variant_length, alleles, gt, phased)

def generate_schema(
self,
Expand Down Expand Up @@ -164,6 +166,16 @@ def generate_schema(
min_position = np.min(self.ts.sites_position)
max_position = np.max(self.ts.sites_position)

tables = self.ts.tables
ancestral_state_offsets = tables.sites.ancestral_state_offset
derived_state_offsets = tables.mutations.derived_state_offset
ancestral_lengths = ancestral_state_offsets[1:] - ancestral_state_offsets[:-1]
derived_lengths = derived_state_offsets[1:] - derived_state_offsets[:-1]
max_variant_length = max(
np.max(ancestral_lengths) if len(ancestral_lengths) > 0 else 0,
np.max(derived_lengths) if len(derived_lengths) > 0 else 0,
)

array_specs = [
vcz.ZarrArraySpec(
source="position",
Expand All @@ -179,6 +191,13 @@ def generate_schema(
dimensions=["variants", "alleles"],
description="Alleles for each variant",
),
vcz.ZarrArraySpec(
source=None,
name="variant_length",
dtype=core.min_int_dtype(0, max_variant_length),
dimensions=["variants"],
description="Length of each variant",
),
vcz.ZarrArraySpec(
source=None,
name="variant_contig",
Expand Down
23 changes: 16 additions & 7 deletions bio2zarr/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1040,14 +1040,19 @@ def iter_genotypes(self, shape, start, stop):
yield sanitised_genotypes, sanitised_phased

def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
variant_lengths = self.fields["rlen"].iter_values(start, stop)
if self.gt_field is None or shape is None:
for alleles in self.iter_alleles(start, stop, num_alleles):
yield alleles, (None, None)
for variant_length, alleles in zip(
variant_lengths, self.iter_alleles(start, stop, num_alleles)
):
yield vcz.VariantData(variant_length, alleles, None, None)
else:
yield from zip(
for variant_length, alleles, (gt, phased) in zip(
variant_lengths,
self.iter_alleles(start, stop, num_alleles),
self.iter_genotypes(shape, start, stop),
)
):
yield vcz.VariantData(variant_length, alleles, gt, phased)

def generate_schema(
self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
Expand Down Expand Up @@ -1121,6 +1126,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
compressor=compressor,
)

name_map = {field.full_name: field for field in self.metadata.fields}
array_specs = [
fixed_field_spec(
name="variant_contig",
Expand All @@ -1136,6 +1142,11 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
dtype="O",
dimensions=["variants", "alleles"],
),
fixed_field_spec(
name="variant_length",
dtype=name_map["rlen"].smallest_dtype(),
dimensions=["variants"],
),
fixed_field_spec(
name="variant_id",
dtype="O",
Expand All @@ -1145,14 +1156,12 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
dtype="bool",
),
]
name_map = {field.full_name: field for field in self.metadata.fields}

# Only three of the fixed fields have a direct one-to-one mapping.
# Only two of the fixed fields have a direct one-to-one mapping.
array_specs.extend(
[
spec_from_field(name_map["QUAL"], array_name="variant_quality"),
spec_from_field(name_map["POS"], array_name="variant_position"),
spec_from_field(name_map["rlen"], array_name="variant_length"),
]
)
array_specs.extend(
Expand Down
30 changes: 23 additions & 7 deletions bio2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@
}


@dataclasses.dataclass
class VariantData:
"""Represents variant data returned by iter_alleles_and_genotypes."""

variant_length: int
alleles: np.ndarray
genotypes: np.ndarray
phased: np.ndarray


class Source(abc.ABC):
@property
@abc.abstractmethod
Expand Down Expand Up @@ -794,6 +804,7 @@ def encode_array_partition(self, array_spec, partition_index):
def encode_alleles_and_genotypes_partition(self, partition_index):
partition = self.metadata.partitions[partition_index]
alleles = self.init_partition_array(partition_index, "variant_allele")
variant_lengths = self.init_partition_array(partition_index, "variant_length")
has_gt = self.has_genotypes()
shape = None
if has_gt:
Expand All @@ -802,18 +813,21 @@ def encode_alleles_and_genotypes_partition(self, partition_index):
partition_index, "call_genotype_phased"
)
shape = gt.buff.shape[1:]
for alleles_value, (genotype, phased) in self.source.iter_alleles_and_genotypes(
for variant_data in self.source.iter_alleles_and_genotypes(
partition.start, partition.stop, shape, alleles.array.shape[1]
):
j_alleles = alleles.next_buffer_row()
alleles.buff[j_alleles] = alleles_value
alleles.buff[j_alleles] = variant_data.alleles
j_variant_length = variant_lengths.next_buffer_row()
variant_lengths.buff[j_variant_length] = variant_data.variant_length
if has_gt:
j = gt.next_buffer_row()
gt.buff[j] = genotype
gt.buff[j] = variant_data.genotypes
j_phased = gt_phased.next_buffer_row()
gt_phased.buff[j_phased] = phased
gt_phased.buff[j_phased] = variant_data.phased

self.finalise_partition_array(partition_index, alleles)
self.finalise_partition_array(partition_index, variant_lengths)
if has_gt:
self.finalise_partition_array(partition_index, gt)
self.finalise_partition_array(partition_index, gt_phased)
Expand Down Expand Up @@ -1103,14 +1117,16 @@ def __init__(self, path):
def create_index(self):
"""Create an index to support efficient region queries."""
root = zarr.open_group(store=self.path, mode="r+")

print(list(root.keys()))
if (
"variant_contig" not in root
or "variant_position" not in root
or "variant_length" not in root
):
logger.warning("Cannot create index: required arrays not found")
return
raise ValueError(
"Cannot create index: variant_contig, "
"variant_position and variant_length arrays are required"
)

contig = root["variant_contig"]
pos = root["variant_position"]
Expand Down
4 changes: 2 additions & 2 deletions tests/data/plink/example.bim
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
1 1_10 0 10 A G
1 1_20 0 20 T C
1 1_10 0 10 A GG
1 1_20 0 20 TTT C
2 changes: 1 addition & 1 deletion tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def test_examples(self, chunk_size, size, start, stop):
# It works in CI on Linux, but it'll probably break at some point.
# It's also necessary to update these numbers each time a new data
# file gets added
("tests/data", 5045029),
("tests/data", 5045032),
("tests/data/vcf", 5018640),
("tests/data/vcf/sample.vcf.gz", 1089),
],
Expand Down
18 changes: 15 additions & 3 deletions tests/test_plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,8 @@ class TestExample:
"""
.bim file looks like this:

1 1_10 0 10 A G
1 1_20 0 20 T C
1 1_10 0 10 A GG
1 1_20 0 20 TTT C

Definition: https://www.cog-genomics.org/plink/1.9/formats#bim
Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0'
Expand All @@ -121,7 +121,10 @@ def test_variant_position(self, ds):
nt.assert_array_equal(ds.variant_position, [10, 20])

def test_variant_allele(self, ds):
nt.assert_array_equal(ds.variant_allele, [["A", "G"], ["T", "C"]])
nt.assert_array_equal(ds.variant_allele, [["A", "GG"], ["TTT", "C"]])

def test_variant_length(self, ds):
nt.assert_array_equal(ds.variant_length, [2, 3])

def test_contig_id(self, ds):
"""Test that contig identifiers are correctly extracted and stored."""
Expand Down Expand Up @@ -266,6 +269,9 @@ def test_chunk_size(
worker_processes=worker_processes,
)
ds2 = sg.load_dataset(out)
# Drop the region_index as it is chunk dependent
ds = ds.drop_vars("region_index")
ds2 = ds2.drop_vars("region_index")
xt.assert_equal(ds, ds2)
# TODO check array chunks

Expand Down Expand Up @@ -372,3 +378,9 @@ def test_genotypes(self, ds):

def test_variant_position(self, ds):
nt.assert_array_equal(ds.variant_position, [10, 20, 10, 10, 20, 10])

def test_variant_length(self, ds):
nt.assert_array_equal(
ds.variant_length,
[1, 1, 1, 1, 1, 1],
)
Loading