diff --git a/CHANGELOG.md b/CHANGELOG.md index f97f6e21..b80dfb7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -# 0.1.2 2024-XX-XX +# 0.1.2 2025-02-04 - Reduce memory requirement for encoding genotypes with large sample sizes @@ -6,6 +6,10 @@ - Add chunksize options to mkschema (issue:294) +- Add experimental support for local alleles. + +- Add experimental support for ``region_index`` + Breaking changes - ICF metadata format version bumped to ensure long-term compatility between numpy 1.26.x diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 83da13f9..78f36041 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -339,6 +339,11 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles): """ Generate a schema for zarr encoding """ + if local_alleles: + click.echo( + "WARNING: Local alleles support is preliminary; please use with caution.", + err=True, + ) stream = click.get_text_stream("stdout") vcf2zarr.mkschema( icf_path, diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 3bab2b12..074e0712 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -204,6 +204,7 @@ def convert_local_allele_field_types(fields): shape = gt.shape[:-1] chunks = gt.chunks[:-1] + dimensions = gt.dimensions[:-1] la = ZarrArraySpec.new( vcf_field=None, @@ -211,7 +212,7 @@ def convert_local_allele_field_types(fields): dtype="i1", shape=gt.shape, chunks=gt.chunks, - dimensions=gt.dimensions, # FIXME + dimensions=(*dimensions, "local_alleles"), description=( "0-based indices into REF+ALT, indicating which alleles" " are relevant (local) for the current sample" @@ -224,8 +225,8 @@ def convert_local_allele_field_types(fields): ad.vcf_field = None ad.shape = (*shape, 2) ad.chunks = (*chunks, 2) + ad.dimensions = (*dimensions, "local_alleles") ad.description += " (local-alleles)" - # TODO fix dimensions pl = fields_by_name.get("call_PL", None) if pl is not None: @@ -235,7 +236,7 @@ def 
convert_local_allele_field_types(fields): pl.shape = (*shape, 3) pl.chunks = (*chunks, 3) pl.description += " (local-alleles)" - # TODO fix dimensions + pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1]) return [*fields, la] diff --git a/docs/vcf2zarr/overview.md b/docs/vcf2zarr/overview.md index 6611003a..c6fad0de 100644 --- a/docs/vcf2zarr/overview.md +++ b/docs/vcf2zarr/overview.md @@ -9,6 +9,8 @@ See the {ref}`sec-vcf2zarr-tutorial` for a step-by-step introduction and the {ref}`sec-vcf2zarr-cli-ref` detailed documentation on command line options. +See the [bioRxiv preprint](https://www.biorxiv.org/content/10.1101/2024.06.11.598241) for +further details. ## Quickstart @@ -43,13 +45,18 @@ vcf2zarr inspect sample.vcz ### What next? VCF Zarr is a starting point in what we hope will become a diverse ecosytem -of packages that efficiently process VCF data in Zarr format. However, this -ecosytem does not exist yet, and there isn't much software available -for working with the format. As such, VCF Zarr isn't suitable for end users -who just want to get their work done for the moment. +of packages that efficiently process VCF data in Zarr format. This +ecosystem is in its infancy and there isn't much software available +for performing off-the-shelf bioinformatics tasks +working with the format. As such, VCF Zarr isn't suitable for end users +who just want to get their work done for the moment, and is currently +aimed at methods developers and early adopters. Having said that, you can: +- Use [vcztools](https://github.com/sgkit-dev/vcztools/) as a drop-in replacement + for bcftools, transparently using Zarr on local storage or cloud stores as the + backend. - Look at the [VCF Zarr specification](https://github.com/sgkit-dev/vcf-zarr-spec/) to see how data is mapped from VCF to Zarr - Use the mature [Zarr Python](https://zarr.readthedocs.io/en/stable/) package or @@ -59,6 +66,9 @@ your data. sister project to analyse the data. 
Note that sgkit is under active development, however, and the documentation may not be fully in-sync with this project. +For more information, please see our +bioRxiv preprint [Analysis-ready VCF at Biobank scale using Zarr]( +https://www.biorxiv.org/content/10.1101/2024.06.11.598241). ## How does it work? @@ -83,6 +93,56 @@ across cores on a single machine (via the ``--worker-processes`` argument) or distributed across a cluster by the three-part ``init``, ``partition`` and ``finalise`` commands. +## Local alleles + +As discussed in our [preprint]( +https://www.biorxiv.org/content/10.1101/2024.06.11.598241) +vcf2zarr has an experimental implementation of the local alleles data +reduction technique. This essentially reduces the inner dimension of +large fields such as AD by storing information relevant only to the alleles +involved in a particular variant call, rather than information +for all alleles. This can make a substantial difference when there is a large +number of alleles. + +To use local alleles, you must generate a storage schema (see the +{ref}`sec-vcf2zarr-tutorial-medium-dataset` section of the tutorial) +using the {ref}`mkschema` command with the +``--local-alleles`` option. This will generate the ``call_LA`` field +which lists the alleles observed for each genotype call, and +translate supported fields from their global alleles to local +alleles representation. + +:::{warning} +Support for local-alleles is preliminary and may be subject to change +as the details of how alleles for a particular call are chosen, and the +number of alleles retained, are determined. Please open an issue on +[GitHub](https://github.com/sgkit-dev/bio2zarr/issues/) if you would like to +help improve Bio2zarr's local alleles implementation. +::: + +:::{note} +Only the PL and AD fields are currently supported for local alleles +data reduction. 
Please comment on our +[local alleles fields tracking issue](https://github.com/sgkit-dev/bio2zarr/issues/315) +if you would like to see other fields supported, or to help out with +implementing more. +::: + +## Debugging + +When things go wrong with conversion, it's very useful to generate some +debugging information using the ``-v`` or ``-vv`` options. These can +help identify what's going wrong (usually running out of memory). + +:::{warning} +To get the full logging output you **must use** -p0, such that no multiprocessing +is used. This means that tracking down problems can be slow, unfortunately. +This is due to a bug in the way logging from child processes is handled; +see [issue 302](https://github.com/sgkit-dev/bio2zarr/issues/302) for details. +::: + + + ## Copying to object stores :::{todo} diff --git a/docs/vcf2zarr/tutorial.md b/docs/vcf2zarr/tutorial.md index 5d800524..2125096e 100644 --- a/docs/vcf2zarr/tutorial.md +++ b/docs/vcf2zarr/tutorial.md @@ -83,6 +83,8 @@ chunks are *larger* than the actual arrays. This is because it's a tiny example, with only 9 variants and 3 samples (see the ``shape`` column), so, for example ``call_genotype`` is only 54 bytes. + +(sec-vcf2zarr-tutorial-medium-dataset)= ## Medium dataset Conversion in ``vcf2zarr`` is a two step process. 
First we convert the VCF(s) to diff --git a/tests/test_cli.py b/tests/test_cli.py index c1411a97..2253dfaf 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -729,6 +729,9 @@ def test_mkschema_local_alleles(self, tmp_path, local_alleles): d = json.loads(result.stdout) call_LA_exists = "call_LA" in [f["name"] for f in d["fields"]] assert call_LA_exists == local_alleles + assert ( + "WARNING: Local alleles support is preliminary;" in result.stderr + ) == local_alleles def test_encode(self, tmp_path): icf_path = tmp_path / "icf" diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index 06a0aaf5..64e3b26e 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -716,16 +716,19 @@ def test_call_LAD(self, ds): [[446, -2], [393, -2], [486, -2]], ] nt.assert_array_equal(ds.call_LAD.values, call_LAD) + assert ds.call_LAD.dims == ("variants", "samples", "local_alleles") def test_call_LA(self, ds): # All the genotypes are 0/0 call_LA = np.full((23, 3, 2), -2) call_LA[:, :, 0] = 0 nt.assert_array_equal(ds.call_LA.values, call_LA) + assert ds.call_LA.dims == ("variants", "samples", "local_alleles") def test_call_LPL(self, ds): call_LPL = np.tile([0, -2, -2], (23, 3, 1)) nt.assert_array_equal(ds.call_LPL.values, call_LPL) + assert ds.call_LPL.dims == ("variants", "samples", "local_genotypes") class Test1000G2020AnnotationsExample: diff --git a/tests/test_vcz.py b/tests/test_vcz.py index 48fcc1d9..1d15568b 100644 --- a/tests/test_vcz.py +++ b/tests/test_vcz.py @@ -40,6 +40,17 @@ def schema(schema_path): return vcf2zarr.VcfZarrSchema.fromjson(f.read()) +@pytest.fixture(scope="module") +def local_alleles_schema(icf_path, tmp_path_factory): + # FIXME: this is stupid way of getting a test fixture, should + # be much easier. 
+ out = tmp_path_factory.mktemp("data") / "example.schema.json" + with open(out, "w") as f: + vcf2zarr.mkschema(icf_path, f, local_alleles=True) + with open(out) as f: + return vcf2zarr.VcfZarrSchema.fromjson(f.read()) + + @pytest.fixture(scope="module") def zarr_path(icf_path, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.zarr" @@ -445,6 +456,36 @@ def test_call_GQ(self, schema): } +class TestLocalAllelesDefaultSchema: + def test_differences(self, schema, local_alleles_schema): + assert len(schema.fields) == len(local_alleles_schema.fields) - 1 + non_local = [f for f in local_alleles_schema.fields if f.name != "call_LA"] + assert schema.fields == non_local + + def test_call_LA(self, local_alleles_schema): + d = get_field_dict(local_alleles_schema, "call_LA") + assert d == { + "name": "call_LA", + "dtype": "i1", + "shape": (9, 3, 2), + "chunks": (1000, 10000, 2), + "dimensions": ("variants", "samples", "local_alleles"), + "description": ( + "0-based indices into REF+ALT, indicating which alleles" + " are relevant (local) for the current sample" + ), + "vcf_field": None, + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 0, + "blocksize": 0, + }, + "filters": tuple(), + } + + class TestVcfDescriptions: @pytest.mark.parametrize( ("field", "description"),