Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ jobs:
python -m bio2zarr vcf2zarr dencode-partition sample.vcz 1
python -m bio2zarr vcf2zarr dencode-partition sample.vcz 2
python -m bio2zarr vcf2zarr dencode-finalise sample.vcz
- name: Run tskit2zarr example
run: |
python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees sample.vcz -f
- name: Run plink2zarr example
run: |
python -m bio2zarr plink2zarr convert tests/data/plink/example sample.vcz -f
- name: Run tests
run: |
pytest --cov=bio2zarr
Expand Down Expand Up @@ -137,6 +143,14 @@ jobs:
run: |
vcfpartition --help
python -m bio2zarr vcfpartition --help
- name: Check tskit2zarr CLI
run: |
tskit2zarr --help
python -m bio2zarr tskit2zarr --help
- name: Check plink2zarr CLI
run: |
plink2zarr --help
python -m bio2zarr plink2zarr --help

test-numpy-version:
name: Test numpy versions
Expand Down
12 changes: 10 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
# 0.1.6 2025-0X-XX
# 0.1.6 2025-05-23

- Initial Python API support for VCF and tskit one-shot conversion. Format
conversion is done using the functions ``bio2zarr.vcf.convert``
and ``bio2zarr.tskit.convert``.

- Initial version of supported plink2zarr (#390, #344, #382)

- Initial version of tskit2zarr (#232)

- Make format-specific dependencies optional (#385)

- Remove bed_reader dependency (#397, #400)

- Change default number of worker processes to zero (#404) to simplify
debugging

Breaking changes
*Breaking changes*

- Remove explicit sample, contig and filter lists from the schema.
Existing ICFs will need to be recreated. (#343)
Expand Down
4 changes: 2 additions & 2 deletions bio2zarr/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ def bio2zarr():
# is handy for development and for those whose PATHs aren't set
# up in the right way.
bio2zarr.add_command(cli.vcf2zarr_main)
bio2zarr.add_command(cli.plink2zarr)
bio2zarr.add_command(cli.plink2zarr_main)
bio2zarr.add_command(cli.tskit2zarr_main)
bio2zarr.add_command(cli.vcfpartition)
bio2zarr.add_command(cli.tskit2zarr)

if __name__ == "__main__":
bio2zarr()
16 changes: 10 additions & 6 deletions bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ def vcf2zarr_main():
Convert VCF file(s) to VCF Zarr format.

See the online documentation at https://sgkit-dev.github.io/bio2zarr/

for more information.
"""

Expand All @@ -551,6 +552,7 @@ def vcf2zarr_main():
@click.command(name="convert")
@click.argument("in_path", type=click.Path())
@click.argument("zarr_path", type=click.Path())
@force
@worker_processes
@progress
@verbose
Expand All @@ -559,6 +561,7 @@ def vcf2zarr_main():
def convert_plink(
in_path,
zarr_path,
force,
verbose,
worker_processes,
progress,
Expand All @@ -571,6 +574,7 @@ def convert_plink(
then running `vcf2zarr convert tmp.vcf zarr_path`
"""
setup_logging(verbose)
check_overwrite_dir(zarr_path, force)
plink.convert(
in_path,
zarr_path,
Expand All @@ -582,15 +586,15 @@ def convert_plink(


@version
@click.group()
def plink2zarr():
@click.group(name="plink2zarr")
def plink2zarr_main():
"""
Convert plink fileset(s) to VCF Zarr format
"""
pass


plink2zarr.add_command(convert_plink)
plink2zarr_main.add_command(convert_plink)


@click.command
Expand Down Expand Up @@ -684,12 +688,12 @@ def convert_tskit(


@version
@click.group()
def tskit2zarr():
@click.group(name="tskit2zarr")
def tskit2zarr_main():
"""
Convert tskit tree sequence(s) to VCF Zarr format
"""
pass


tskit2zarr.add_command(convert_tskit)
tskit2zarr_main.add_command(convert_tskit)
42 changes: 29 additions & 13 deletions bio2zarr/plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,19 @@
BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])


def read_fam(path, sep=None):
# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion
# on the parameters to Pandas here.
def read_fam(path):
# See: https://www.cog-genomics.org/plink/1.9/formats#fam
names = [f[0] for f in FAM_FIELDS]
df = pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
df = pd.read_csv(path, sep=None, names=names, dtype=FAM_DF_DTYPE, engine="python")
return df


def read_bim(path, sep=None):
def read_bim(path):
# See: https://www.cog-genomics.org/plink/1.9/formats#bim
names = [f[0] for f in BIM_FIELDS]
df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE)
df = pd.read_csv(path, sep=None, names=names, dtype=BIM_DF_DTYPE, engine="python")
return df


Expand Down Expand Up @@ -94,6 +96,18 @@ def __init__(self, path, num_variants, num_samples):

self.byte_lookup = lookup

def iter_decode(self, start, stop, buffer_size=None):
"""
Iterate of over the variants in the specified window
with the specified approximate buffer size in bytes (default=10MiB).
"""
if buffer_size is None:
buffer_size = 10 * 1024 * 1024
variants_per_read = max(1, int(buffer_size / self.bytes_per_variant))
for off in range(start, stop, variants_per_read):
genotypes = self.decode(off, min(off + variants_per_read, stop))
yield from genotypes

def decode(self, start, stop):
chunk_size = stop - start

Expand All @@ -102,6 +116,11 @@ def decode(self, start, stop):
start_offset = 3 + (start * self.bytes_per_variant)
bytes_to_read = chunk_size * self.bytes_per_variant

logger.debug(
f"Reading {chunk_size} variants ({bytes_to_read} bytes) "
f"from {self.path}"
)

# TODO make it possible to read sequentially from the same file handle,
# seeking only when necessary.
with open(self.path, "rb") as f:
Expand Down Expand Up @@ -175,19 +194,16 @@ def iter_id(self, start, stop):
yield from self.bim.variant_id[start:stop]

def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
alt_field = self.bim.allele_1.values
ref_field = self.bim.allele_2.values
gt = self.bed_reader.decode(start, stop)
phased = np.zeros(gt.shape[:2], dtype=bool)
for i, (ref, alt) in enumerate(
zip(ref_field[start:stop], alt_field[start:stop])
):
alt_iter = self.bim.allele_1.values[start:stop]
ref_iter = self.bim.allele_2.values[start:stop]
gt_iter = self.bed_reader.iter_decode(start, stop)
for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
alleles[0] = ref
alleles[1 : 1 + len(alt)] = alt

phased = np.zeros(gt.shape[0], dtype=bool)
# rlen is the length of the REF in PLINK as there's no END annotations
yield vcz.VariantData(len(alleles[0]), alleles, gt[i], phased[i])
yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)

def generate_schema(
self,
Expand Down
10 changes: 8 additions & 2 deletions bio2zarr/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1608,7 +1608,7 @@ def mkschema(

def convert(
vcfs,
out_path,
vcz_path,
*,
variants_chunk_size=None,
samples_chunk_size=None,
Expand All @@ -1617,6 +1617,12 @@ def convert(
show_progress=False,
icf_path=None,
):
"""
Convert the VCF data at the specified list of paths
to VCF Zarr format stored at the specified path.

.. todo:: Document parameters
"""
if icf_path is None:
cm = temp_icf_path(prefix="vcf2zarr")
else:
Expand All @@ -1631,7 +1637,7 @@ def convert(
)
encode(
icf_path,
out_path,
vcz_path,
variants_chunk_size=variants_chunk_size,
samples_chunk_size=samples_chunk_size,
worker_processes=worker_processes,
Expand Down
1 change: 1 addition & 0 deletions bio2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,7 @@ def encode_alleles_and_genotypes_partition(self, partition_index):
partition_index, "call_genotype_phased"
)
shape = gt.buff.shape[1:]

for variant_data in self.source.iter_alleles_and_genotypes(
partition.start, partition.stop, shape, alleles.array.shape[1]
):
Expand Down
1 change: 1 addition & 0 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ chapters:
sections:
- file: vcf2zarr/tutorial
- file: vcf2zarr/cli_ref
- file: vcf2zarr/python_api
- file: plink2zarr/overview
sections:
- file: plink2zarr/cli_ref
Expand Down
17 changes: 9 additions & 8 deletions docs/intro.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
- {ref}`sec-plink2zarr` converts PLINK 1.0 data to
[VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format.

- {ref}`sec-tskit2zarr` converts [tskit](https://tskit.dev)
data into [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format.

- {ref}`sec-vcfpartition` is a utility to split an input
VCF into a given number of partitions. This is useful for
parallel processing of VCF data.
Expand All @@ -25,13 +28,11 @@ support for other formats such as BGEN (or an interested in helping with impleme
please open an [issue on Github](https://github.com/sgkit-dev/bio2zarr/issues)
to discuss!

## Python APIs

The package is currently focused on command line interfaces, but a
Python API is also planned.
There is access to some limited functionality via Python APIs (documented
along with the respective tools). These are in beta, and should be fully
documented and stabilised in the coming releases. General APIs to enable
efficient and straightforward encoding of data to VCZ are planned
(see [issue #412](https://github.com/sgkit-dev/bio2zarr/issues/412)).

:::{warning}
Although it is possible to import the bio2zarr Python package
the APIs are purely internal for the moment and will change
in arbitrary ways. Please don't use them (or open issues about
them on GitHub).
:::
17 changes: 17 additions & 0 deletions docs/vcf2zarr/python_api.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
(sec-vcf2zarr-python-api)=
# Python API

Basic usage:
```python
import bio2zarr.vcf as v2z

v2z.convert([vcf_path], vcz_path)
```

## API reference

```{eval-rst}

.. autofunction:: bio2zarr.vcf.convert

```
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ documentation = "https://sgkit-dev.github.io/bio2zarr/"
vcf2zarr = "bio2zarr.cli:vcf2zarr_main"
vcfpartition = "bio2zarr.cli:vcfpartition"
tskit2zarr = "bio2zarr.cli:tskit2zarr_main"
plink2zarr = "bio2zarr.cli:plink2zarr_main"

[project.optional-dependencies]
dev = [
Expand Down
Loading