Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
# 0.1.2 2024-XX-XX
# 0.1.2 2025-02-04

- Reduce memory requirement for encoding genotypes with large sample sizes

- Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300)

- Add chunksize options to mkschema (issue:294)

- Add experimental support for local alleles.

- Add experimental support for ``region_index``

Breaking changes

- ICF metadata format version bumped to ensure long-term compatibility between numpy 1.26.x
Expand Down
5 changes: 5 additions & 0 deletions bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,11 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
"""
Generate a schema for zarr encoding
"""
if local_alleles:
click.echo(
"WARNING: Local alleles support is preliminary; please use with caution.",
err=True,
)
stream = click.get_text_stream("stdout")
vcf2zarr.mkschema(
icf_path,
Expand Down
7 changes: 4 additions & 3 deletions bio2zarr/vcf2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,14 +204,15 @@ def convert_local_allele_field_types(fields):

shape = gt.shape[:-1]
chunks = gt.chunks[:-1]
dimensions = gt.dimensions[:-1]

la = ZarrArraySpec.new(
vcf_field=None,
name="call_LA",
dtype="i1",
shape=gt.shape,
chunks=gt.chunks,
dimensions=gt.dimensions, # FIXME
dimensions=(*dimensions, "local_alleles"),
description=(
"0-based indices into REF+ALT, indicating which alleles"
" are relevant (local) for the current sample"
Expand All @@ -224,8 +225,8 @@ def convert_local_allele_field_types(fields):
ad.vcf_field = None
ad.shape = (*shape, 2)
ad.chunks = (*chunks, 2)
ad.dimensions = (*dimensions, "local_alleles")
ad.description += " (local-alleles)"
# TODO fix dimensions

pl = fields_by_name.get("call_PL", None)
if pl is not None:
Expand All @@ -235,7 +236,7 @@ def convert_local_allele_field_types(fields):
pl.shape = (*shape, 3)
pl.chunks = (*chunks, 3)
pl.description += " (local-alleles)"
# TODO fix dimensions
pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1])
return [*fields, la]


Expand Down
68 changes: 64 additions & 4 deletions docs/vcf2zarr/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ See the {ref}`sec-vcf2zarr-tutorial` for a step-by-step introduction
and the {ref}`sec-vcf2zarr-cli-ref` detailed documentation on
command line options.

See the [bioRxiv preprint](https://www.biorxiv.org/content/10.1101/2024.06.11.598241) for
further details.

## Quickstart

Expand Down Expand Up @@ -43,13 +45,18 @@ vcf2zarr inspect sample.vcz
### What next?

VCF Zarr is a starting point in what we hope will become a diverse ecosystem
of packages that efficiently process VCF data in Zarr format. However, this
ecosystem does not exist yet, and there isn't much software available
for working with the format. As such, VCF Zarr isn't suitable for end users
who just want to get their work done for the moment.
of packages that efficiently process VCF data in Zarr format. This
ecosystem is in its infancy and there isn't much software available
for performing off-the-shelf bioinformatics tasks
working with the format. As such, VCF Zarr isn't suitable for end users
who just want to get their work done for the moment, and is currently
aimed at methods developers and early adopters.

Having said that, you can:

- Use [vcztools](https://github.com/sgkit-dev/vcztools/) as a drop-in replacement
for bcftools, transparently using Zarr on local storage or cloud stores as the
backend.
- Look at the [VCF Zarr specification](https://github.com/sgkit-dev/vcf-zarr-spec/)
to see how data is mapped from VCF to Zarr
- Use the mature [Zarr Python](https://zarr.readthedocs.io/en/stable/) package or
Expand All @@ -59,6 +66,9 @@ your data.
sister project to analyse the data. Note that sgkit is under active development,
however, and the documentation may not be fully in-sync with this project.

For more information, please see our
bioRxiv preprint [Analysis-ready VCF at Biobank scale using Zarr](
https://www.biorxiv.org/content/10.1101/2024.06.11.598241).


## How does it work?
Expand All @@ -83,6 +93,56 @@ across cores on a single machine (via the ``--worker-processes`` argument)
or distributed across a cluster by the three-part ``init``, ``partition``
and ``finalise`` commands.

## Local alleles

As discussed in our [preprint](
https://www.biorxiv.org/content/10.1101/2024.06.11.598241)
vcf2zarr has an experimental implementation of the local alleles data
reduction technique. This essentially reduces the inner dimension of
large fields such as AD by storing information relevant only to the alleles
involved in a particular variant call, rather than information
for all alleles. This can make a substantial difference when there is a large
number of alleles.

To use local alleles, you must generate a storage schema (see the
{ref}`sec-vcf2zarr-tutorial-medium-dataset` section of the tutorial)
using the {ref}`mkschema<cmd-vcf2zarr-mkschema>` command with the
``--local-alleles`` option. This will generate the ``call_LA`` field
which lists the alleles observed for each genotype call, and
translate supported fields from their global alleles to local
alleles representation.

:::{warning}
Support for local-alleles is preliminary and may be subject to change
as the details of how alleles for a particular call are chosen, and the
number of alleles retained, are determined. Please open an issue on
[GitHub](https://github.com/sgkit-dev/bio2zarr/issues/) if you would like to
help improve Bio2zarr's local alleles implementation.
:::

:::{note}
Only the PL and AD fields are currently supported for local alleles
data reduction. Please comment on our
[local alleles fields tracking issue](https://github.com/sgkit-dev/bio2zarr/issues/315)
if you would like to see other fields supported, or to help out with
implementing more.
:::

## Debugging

When things go wrong with conversion, it's very useful to generate some
debugging information using the ``-v`` or ``-vv`` options. These can
help identify what's going wrong (usually running out of memory).

:::{warning}
To get the full logging output you **must use** -p0, such that no multiprocessing
is used. This means that tracking down problems can be slow, unfortunately.
This is due to a bug in the way logging from child processes is handled;
see [issue 302](https://github.com/sgkit-dev/bio2zarr/issues/302) for details.
:::



## Copying to object stores

:::{todo}
Expand Down
2 changes: 2 additions & 0 deletions docs/vcf2zarr/tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ chunks are *larger* than the actual arrays. This is because it's
a tiny example, with only 9 variants and 3 samples (see the ``shape``
column), so, for example ``call_genotype`` is only 54 bytes.


(sec-vcf2zarr-tutorial-medium-dataset)=
## Medium dataset

Conversion in ``vcf2zarr`` is a two step process. First we convert the VCF(s) to
Expand Down
3 changes: 3 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,9 @@ def test_mkschema_local_alleles(self, tmp_path, local_alleles):
d = json.loads(result.stdout)
call_LA_exists = "call_LA" in [f["name"] for f in d["fields"]]
assert call_LA_exists == local_alleles
assert (
"WARNING: Local alleles support is preliminary;" in result.stderr
) == local_alleles

def test_encode(self, tmp_path):
icf_path = tmp_path / "icf"
Expand Down
3 changes: 3 additions & 0 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,16 +716,19 @@ def test_call_LAD(self, ds):
[[446, -2], [393, -2], [486, -2]],
]
nt.assert_array_equal(ds.call_LAD.values, call_LAD)
assert ds.call_LAD.dims == ("variants", "samples", "local_alleles")

def test_call_LA(self, ds):
# All the genotypes are 0/0
call_LA = np.full((23, 3, 2), -2)
call_LA[:, :, 0] = 0
nt.assert_array_equal(ds.call_LA.values, call_LA)
assert ds.call_LA.dims == ("variants", "samples", "local_alleles")

def test_call_LPL(self, ds):
call_LPL = np.tile([0, -2, -2], (23, 3, 1))
nt.assert_array_equal(ds.call_LPL.values, call_LPL)
assert ds.call_LPL.dims == ("variants", "samples", "local_genotypes")


class Test1000G2020AnnotationsExample:
Expand Down
41 changes: 41 additions & 0 deletions tests/test_vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ def schema(schema_path):
return vcf2zarr.VcfZarrSchema.fromjson(f.read())


@pytest.fixture(scope="module")
def local_alleles_schema(icf_path, tmp_path_factory):
# FIXME: this is stupid way of getting a test fixture, should
# be much easier.
out = tmp_path_factory.mktemp("data") / "example.schema.json"
with open(out, "w") as f:
vcf2zarr.mkschema(icf_path, f, local_alleles=True)
with open(out) as f:
return vcf2zarr.VcfZarrSchema.fromjson(f.read())


@pytest.fixture(scope="module")
def zarr_path(icf_path, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.zarr"
Expand Down Expand Up @@ -445,6 +456,36 @@ def test_call_GQ(self, schema):
}


class TestLocalAllelesDefaultSchema:
def test_differences(self, schema, local_alleles_schema):
assert len(schema.fields) == len(local_alleles_schema.fields) - 1
non_local = [f for f in local_alleles_schema.fields if f.name != "call_LA"]
assert schema.fields == non_local

def test_call_LA(self, local_alleles_schema):
d = get_field_dict(local_alleles_schema, "call_LA")
assert d == {
"name": "call_LA",
"dtype": "i1",
"shape": (9, 3, 2),
"chunks": (1000, 10000, 2),
"dimensions": ("variants", "samples", "local_alleles"),
"description": (
"0-based indices into REF+ALT, indicating which alleles"
" are relevant (local) for the current sample"
),
"vcf_field": None,
"compressor": {
"id": "blosc",
"cname": "zstd",
"clevel": 7,
"shuffle": 0,
"blocksize": 0,
},
"filters": tuple(),
}


class TestVcfDescriptions:
@pytest.mark.parametrize(
("field", "description"),
Expand Down