Skip to content
Merged
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,11 @@ jobs:
python -m venv env-plink
source env-plink/bin/activate
python -m pip install .
python -m bio2zarr plink2zarr convert tests/data/plink/example.bed plink.vcz > plink.txt 2>&1 || echo $? > plink_exit.txt
python -m bio2zarr plink2zarr convert tests/data/plink/example plink.vcz > plink.txt 2>&1 || echo $? > plink_exit.txt
test "$(cat plink_exit.txt)" = "1"
grep -q "This process requires the optional bed_reader module. Install it with: pip install bio2zarr\[plink\]" plink.txt
python -m pip install '.[plink]'
python -m bio2zarr plink2zarr convert tests/data/plink/example.bed plink.vcz
python -m bio2zarr plink2zarr convert tests/data/plink/example plink.vcz
deactivate

python -m venv env-vcf
Expand Down
8 changes: 4 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
# 0.1.6 2025-0X-XX

- Make format-specific dependencies optional (#385)

- Add contigs to plink output (#344)
- Initial version of supported plink2zarr (#390, #344, #382)

- Add variant_length and indexing to plink output (#382)
- Make format-specific dependencies optional (#385)

Breaking changes

Expand All @@ -14,6 +12,8 @@ Breaking changes
- Add dimensions and default compressor and filter settings to the schema.
(#361)

- Various changes to existing experimental plink encoding (#390)

# 0.1.5 2025-03-31

- Add support for merging contig IDs across multiple VCFs (#335)
Expand Down
16 changes: 12 additions & 4 deletions bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ def encode(
worker_processes,
):
"""
Convert intermediate columnar format to vcfzarr.
Convert intermediate columnar format to VCF Zarr.
"""
setup_logging(verbose)
check_overwrite_dir(zarr_path, force)
Expand Down Expand Up @@ -504,7 +504,7 @@ def convert_vcf(
local_alleles,
):
"""
Convert input VCF(s) directly to vcfzarr (not recommended for large files).
Convert input VCF(s) directly to VCF Zarr (not recommended for large files).
"""
setup_logging(verbose)
check_overwrite_dir(zarr_path, force)
Expand All @@ -523,7 +523,7 @@ def convert_vcf(
@click.group(cls=NaturalOrderGroup, name="vcf2zarr")
def vcf2zarr_main():
"""
Convert VCF file(s) to the vcfzarr format.
Convert VCF file(s) to VCF Zarr format.

See the online documentation at https://sgkit-dev.github.io/bio2zarr/
for more information.
Expand Down Expand Up @@ -561,7 +561,9 @@ def convert_plink(
samples_chunk_size,
):
"""
In development; DO NOT USE!
Convert plink fileset to VCF Zarr. Results are equivalent to
`plink1.9 --bfile prefix --keep-allele-order --recode vcf-iid --out tmp`
then running `vcf2zarr convert tmp.vcf zarr_path`
"""
setup_logging(verbose)
plink.convert(
Expand All @@ -577,6 +579,9 @@ def convert_plink(
@version
@click.group()
def plink2zarr():
"""
Convert plink fileset(s) to VCF Zarr format
"""
pass


Expand Down Expand Up @@ -676,6 +681,9 @@ def convert_tskit(
@version
@click.group()
def tskit2zarr():
"""
Convert tskit tree sequence(s) to VCF Zarr format
"""
pass


Expand Down
104 changes: 54 additions & 50 deletions bio2zarr/plink.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,46 @@
import dataclasses
import logging
import pathlib

import numpy as np
import zarr

from bio2zarr import constants, core, vcz

logger = logging.getLogger(__name__)


@dataclasses.dataclass
class PlinkPaths:
bed_path: str
bim_path: str
fam_path: str


class PlinkFormat(vcz.Source):
@core.requires_optional_dependency("bed_reader", "plink")
def __init__(self, path):
def __init__(self, prefix):
import bed_reader

self._path = pathlib.Path(path)
self.bed = bed_reader.open_bed(path, num_threads=1, count_A1=False)
# TODO we will need support multiple chromosomes here to join
# plinks into on big zarr. So, these will require multiple
# bed and bim files, but should share a .fam
self.prefix = str(prefix)
paths = PlinkPaths(
self.prefix + ".bed",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be nicer to have the PlinkPaths __init__ just take the prefix?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pros and cons - I tend to try and keep these dataclasses as simple as possible for reasoning about serialising and so on.

self.prefix + ".bim",
self.prefix + ".fam",
)
self.bed = bed_reader.open_bed(
paths.bed_path,
bim_location=paths.bim_path,
fam_location=paths.fam_path,
num_threads=1,
count_A1=True,
)

@property
def path(self):
return self._path
return self.prefix

@property
def num_records(self):
Expand All @@ -46,9 +67,12 @@ def iter_field(self, field_name, shape, start, stop):
assert field_name == "position" # Only position field is supported from plink
yield from self.bed.bp_position[start:stop]

def iter_id(self, start, stop):
yield from self.bed.sid[start:stop]

def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
ref_field = self.bed.allele_1
alt_field = self.bed.allele_2
alt_field = self.bed.allele_1
ref_field = self.bed.allele_2
bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T
gt = np.zeros(shape, dtype=np.int8)
phased = np.zeros(shape[:-1], dtype=bool)
Expand All @@ -61,9 +85,10 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
gt[:] = 0
gt[bed_chunk[i] == -127] = -1
gt[bed_chunk[i] == 2] = 1
gt[bed_chunk[i] == 1, 0] = 1
gt[bed_chunk[i] == 1, 1] = 1

yield vcz.VariantData(max(len(a) for a in alleles), alleles, gt, phased)
# rlen is the length of the REF in PLINK as there's no END annotations
yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)

def generate_schema(
self,
Expand Down Expand Up @@ -92,6 +117,9 @@ def generate_schema(
f"variants={dimensions['variants'].chunk_size}, "
f"samples={dimensions['samples'].chunk_size}"
)
# If we don't have SVLEN or END annotations, the rlen field is defined
# as the length of the REF
max_len = self.bed.allele_2.itemsize

array_specs = [
vcz.ZarrArraySpec(
Expand All @@ -107,10 +135,22 @@ def generate_schema(
dimensions=["variants", "alleles"],
description=None,
),
vcz.ZarrArraySpec(
name="variant_id",
dtype="O",
dimensions=["variants"],
description=None,
),
vcz.ZarrArraySpec(
name="variant_id_mask",
dtype="bool",
dimensions=["variants"],
description=None,
),
vcz.ZarrArraySpec(
source=None,
name="variant_length",
dtype="i4",
dtype=core.min_int_dtype(0, max_len),
dimensions=["variants"],
description="Length of each variant",
),
Expand Down Expand Up @@ -147,20 +187,20 @@ def generate_schema(


def convert(
bed_path,
zarr_path,
prefix,
out,
*,
variants_chunk_size=None,
samples_chunk_size=None,
worker_processes=1,
show_progress=False,
):
plink_format = PlinkFormat(bed_path)
plink_format = PlinkFormat(prefix)
schema_instance = plink_format.generate_schema(
variants_chunk_size=variants_chunk_size,
samples_chunk_size=samples_chunk_size,
)
zarr_path = pathlib.Path(zarr_path)
zarr_path = pathlib.Path(out)
vzw = vcz.VcfZarrWriter(PlinkFormat, zarr_path)
# Rough heuristic to split work up enough to keep utilisation high
target_num_partitions = max(1, worker_processes * 4)
Expand All @@ -175,39 +215,3 @@ def convert(
)
vzw.finalise(show_progress)
vzw.create_index()


# FIXME do this more efficiently - currently reading the whole thing
# in for convenience, and also comparing call-by-call
@core.requires_optional_dependency("bed_reader", "plink")
def validate(bed_path, zarr_path):
import bed_reader

root = zarr.open(store=zarr_path, mode="r")
call_genotype = root["call_genotype"][:]

bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)

assert call_genotype.shape[0] == bed.sid_count
assert call_genotype.shape[1] == bed.iid_count
bed_genotypes = bed.read(dtype="int8").T
assert call_genotype.shape[0] == bed_genotypes.shape[0]
assert call_genotype.shape[1] == bed_genotypes.shape[1]
assert call_genotype.shape[2] == 2

row_id = 0
for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
# print("ROW", row_id)
# print(bed_row, zarr_row)
row_id += 1
for bed_call, zarr_call in zip(bed_row, zarr_row):
if bed_call == -127:
assert list(zarr_call) == [-1, -1]
elif bed_call == 0:
assert list(zarr_call) == [0, 0]
elif bed_call == 1:
assert list(zarr_call) == [1, 0]
elif bed_call == 2:
assert list(zarr_call) == [1, 1]
else: # pragma no cover
raise AssertionError(f"Unexpected bed call {bed_call}")
3 changes: 3 additions & 0 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ chapters:
sections:
- file: vcf2zarr/tutorial
- file: vcf2zarr/cli_ref
- file: plink2zarr/overview
sections:
- file: plink2zarr/cli_ref
- file: vcfpartition/overview
sections:
- file: vcfpartition/cli_ref
7 changes: 4 additions & 3 deletions docs/intro.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
- {ref}`sec-vcf2zarr` converts VCF data to
[VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format.

- {ref}`sec-plink2zarr` converts PLINK 1.0 data to
[VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format.

- {ref}`sec-vcfpartition` is a utility to split an input
VCF into a given number of partitions. This is useful for
parallel processing of VCF data.
Expand All @@ -17,10 +20,8 @@
`bio2zarr` is in development, contributions, feedback and issues are welcome
at the [GitHub repository](https://github.com/sgkit-dev/bio2zarr).

Support for converting PLINK data to VCF Zarr is partially implemented,
and adding BGEN and [tskit](https://tskit.dev/) support is also planned.
If you would like to see
support for other formats (or an interested in helping with implementing),
support for other formats such as BGEN (or an interested in helping with implementing),
please open an [issue on Github](https://github.com/sgkit-dev/bio2zarr/issues)
to discuss!

Expand Down
17 changes: 17 additions & 0 deletions docs/plink2zarr/cli_ref.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
(sec-plink2zarr-cli-ref)=
# CLI Reference

% A note on cross references... There's some weird long-standing problem with
% cross referencing program values in Sphinx, which means that we can't use
% the built-in labels generated by sphinx-click. We can make our own explicit
% targets, but these have to have slightly weird names to avoid conflicting
% with what sphinx-click is doing. So, hence the cmd- prefix.
% Based on: https://github.com/skypilot-org/skypilot/pull/2834

```{eval-rst}

.. _cmd-plink2zarr-convert:
.. click:: bio2zarr.cli:convert_plink
:prog: plink2zarr convert
:nested: full

38 changes: 38 additions & 0 deletions docs/plink2zarr/overview.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
(sec-plink2zarr)=
# plink2zarr

Convert plink data to the
[VCF Zarr specification](https://github.com/sgkit-dev/vcf-zarr-spec/)
reliably in parallel.

See {ref}`sec-plink2zarr-cli-ref` for detailed documentation on
command line options.

Conversion of the plink data model to VCF follows the semantics of plink1.9 as closely
as possible. That is, given a binary plink fileset with prefix "fileset" (i.e.,
fileset.bed, fileset.bim, fileset.fam), running
```
$ plink2zarr convert fileset out.vcz
```
should produce the same result in ``out.vcz`` as
```
$ plink1.9 --bfile fileset --keep-allele-order --recode vcf-iid --out tmp
$ vcf2zarr convert tmp.vcf out.vcz
```

:::{warning}
It is important to note that we follow the same conventions as plink 2.0
where the A1 allele in the [bim file](https://www.cog-genomics.org/plink/2.0/formats#bim)
is the VCF ALT and A2 is the REF.
:::

:::{note}
Currently we only convert the basic VCF-like data from plink, and don't include
phenotypes and pedigree information. These are planned as future enhancements.
Please comment on [this issue](https://github.com/sgkit-dev/bio2zarr/issues/392)
if you are interested in this functionality.
:::




9 changes: 9 additions & 0 deletions tests/data/plink/example.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.2
##fileDate=20250521
##source=PLINKv1.90
##contig=<ID=1,length=21>
##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional reference allele, may not be based on real reference genome">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ind0 ind1 ind2 ind3 ind4 ind5 ind6 ind7 ind8 ind9
1 10 1_10 GG A . . PR GT 1/1 1/1 1/1 0/0 0/0 0/0 0/0 0/0 0/0 0/0
1 20 1_20 C TTT . . PR GT 1/1 1/1 1/1 1/1 0/0 0/0 0/0 0/0 0/0 0/0
9 changes: 9 additions & 0 deletions tests/data/plink/example_with_fam.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.2
##fileDate=20250521
##source=PLINKv1.90
##contig=<ID=1,length=21>
##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional reference allele, may not be based on real reference genome">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ind0 ind1 ind2 ind3 ind4 ind5 ind6 ind7 ind8 ind9
1 10 1_10 G A . . PR GT 1/1 1/1 1/1 0/0 0/0 0/0 0/0 0/0 0/0 0/0
1 20 1_20 C T . . PR GT 1/1 1/1 1/1 1/1 0/0 0/0 0/0 0/0 0/0 0/0
Loading