Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
# 0.1.2 2024-XX-XX
# 0.1.2 2025-02-04

- Reduce memory requirement for encoding genotypes with large sample sizes

- Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300)

- Add chunksize options to mkschema (issue:294)

- Add experimental support for local alleles.

- Add experimental support for ``region_index``

Breaking changes

- ICF metadata format version bumped to ensure long-term compatibility between numpy 1.26.x
Expand Down
5 changes: 5 additions & 0 deletions bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,11 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
"""
Generate a schema for zarr encoding
"""
if local_alleles:
click.echo(
"WARNING: Local alleles support is preliminary; please use with caution.",
err=True,
)
stream = click.get_text_stream("stdout")
vcf2zarr.mkschema(
icf_path,
Expand Down
7 changes: 4 additions & 3 deletions bio2zarr/vcf2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,14 +204,15 @@ def convert_local_allele_field_types(fields):

shape = gt.shape[:-1]
chunks = gt.chunks[:-1]
dimensions = gt.dimensions[:-1]

la = ZarrArraySpec.new(
vcf_field=None,
name="call_LA",
dtype="i1",
shape=gt.shape,
chunks=gt.chunks,
dimensions=gt.dimensions, # FIXME
dimensions=(*dimensions, "local_alleles"),
description=(
"0-based indices into REF+ALT, indicating which alleles"
" are relevant (local) for the current sample"
Expand All @@ -224,8 +225,8 @@ def convert_local_allele_field_types(fields):
ad.vcf_field = None
ad.shape = (*shape, 2)
ad.chunks = (*chunks, 2)
ad.dimensions = (*dimensions, "local_alleles")
ad.description += " (local-alleles)"
# TODO fix dimensions

pl = fields_by_name.get("call_PL", None)
if pl is not None:
Expand All @@ -235,7 +236,7 @@ def convert_local_allele_field_types(fields):
pl.shape = (*shape, 3)
pl.chunks = (*chunks, 3)
pl.description += " (local-alleles)"
# TODO fix dimensions
pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1])
return [*fields, la]


Expand Down
68 changes: 64 additions & 4 deletions docs/vcf2zarr/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ See the {ref}`sec-vcf2zarr-tutorial` for a step-by-step introduction
and the {ref}`sec-vcf2zarr-cli-ref` detailed documentation on
command line options.

See the [bioRxiv preprint](https://www.biorxiv.org/content/10.1101/2024.06.11.598241) for
further details.

## Quickstart

Expand Down Expand Up @@ -43,13 +45,18 @@ vcf2zarr inspect sample.vcz
### What next?

VCF Zarr is a starting point in what we hope will become a diverse ecosystem
of packages that efficiently process VCF data in Zarr format. However, this
ecosystem does not exist yet, and there isn't much software available
for working with the format. As such, VCF Zarr isn't suitable for end users
who just want to get their work done for the moment.
of packages that efficiently process VCF data in Zarr format. This
ecosystem is in its infancy and there isn't much software available
for performing off-the-shelf bioinformatics tasks
working with the format. As such, VCF Zarr isn't suitable for end users
who just want to get their work done for the moment, and is currently
aimed at methods developers and early adopters.

Having said that, you can:

- Use [vcztools](https://github.com/sgkit-dev/vcztools/) as a drop-in replacement
for bcftools, transparently using Zarr on local storage or cloud stores as the
backend.
- Look at the [VCF Zarr specification](https://github.com/sgkit-dev/vcf-zarr-spec/)
to see how data is mapped from VCF to Zarr
- Use the mature [Zarr Python](https://zarr.readthedocs.io/en/stable/) package or
Expand All @@ -59,6 +66,9 @@ your data.
sister project to analyse the data. Note that sgkit is under active development,
however, and the documentation may not be fully in-sync with this project.

For more information, please see our
bioRxiv preprint [Analysis-ready VCF at Biobank scale using Zarr](
https://www.biorxiv.org/content/10.1101/2024.06.11.598241).


## How does it work?
Expand All @@ -83,6 +93,56 @@ across cores on a single machine (via the ``--worker-processes`` argument)
or distributed across a cluster by the three-part ``init``, ``partition``
and ``finalise`` commands.

## Local alleles

As discussed in our [preprint](
https://www.biorxiv.org/content/10.1101/2024.06.11.598241)
vcf2zarr has an experimental implementation of the local alleles data
reduction technique. This essentially reduces the inner dimension of
large fields such as AD by storing information relevant only to the alleles
involved in a particular variant call, rather than information
for all alleles. This can make a substantial difference when there is a large
number of alleles.

To use local alleles, you must generate a storage schema (see the
{ref}`sec-vcf2zarr-tutorial-medium-dataset` section of the tutorial)
using the {ref}`mkschema<cmd-vcf2zarr-mkschema>` command with the
``--local-alleles`` option. This will generate the ``call_LA`` field
which lists the alleles observed for each genotype call, and
translate supported fields from their global alleles to local
alleles representation.

:::{warning}
Support for local-alleles is preliminary and may be subject to change
as the details of how alleles for a particular call are chosen, and the
number of alleles retained, are determined. Please open an issue on
[GitHub](https://github.com/sgkit-dev/bio2zarr/issues/) if you would like to
help improve Bio2zarr's local alleles implementation.
:::

:::{note}
Only the PL and AD fields are currently supported for local alleles
data reduction. Please comment on our
[local alleles fields tracking issue](https://github.com/sgkit-dev/bio2zarr/issues/315)
if you would like to see other fields supported, or to help out with
implementing more.
:::

## Debugging

When things go wrong with conversion, it's very useful to generate some
debugging information using the ``-v`` or ``-vv`` options. These can
help identify what's going wrong (usually running out of memory).

:::{warning}
To get the full logging output you **must use** -p0, such that no multiprocessing
is used. This means that tracking down problems can be slow, unfortunately.
This is due to a bug in the way logging from child processes is handled;
see [issue 302](https://github.com/sgkit-dev/bio2zarr/issues/302) for details.
:::



## Copying to object stores

:::{todo}
Expand Down
2 changes: 2 additions & 0 deletions docs/vcf2zarr/tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ chunks are *larger* than the actual arrays. This is because it's
a tiny example, with only 9 variants and 3 samples (see the ``shape``
column), so, for example ``call_genotype`` is only 54 bytes.


(sec-vcf2zarr-tutorial-medium-dataset)=
## Medium dataset

Conversion in ``vcf2zarr`` is a two step process. First we convert the VCF(s) to
Expand Down
3 changes: 3 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,9 @@ def test_mkschema_local_alleles(self, tmp_path, local_alleles):
d = json.loads(result.stdout)
call_LA_exists = "call_LA" in [f["name"] for f in d["fields"]]
assert call_LA_exists == local_alleles
assert (
"WARNING: Local alleles support is preliminary;" in result.stderr
) == local_alleles

def test_encode(self, tmp_path):
icf_path = tmp_path / "icf"
Expand Down
3 changes: 3 additions & 0 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,16 +716,19 @@ def test_call_LAD(self, ds):
[[446, -2], [393, -2], [486, -2]],
]
nt.assert_array_equal(ds.call_LAD.values, call_LAD)
assert ds.call_LAD.dims == ("variants", "samples", "local_alleles")

def test_call_LA(self, ds):
# All the genotypes are 0/0
call_LA = np.full((23, 3, 2), -2)
call_LA[:, :, 0] = 0
nt.assert_array_equal(ds.call_LA.values, call_LA)
assert ds.call_LA.dims == ("variants", "samples", "local_alleles")

def test_call_LPL(self, ds):
call_LPL = np.tile([0, -2, -2], (23, 3, 1))
nt.assert_array_equal(ds.call_LPL.values, call_LPL)
assert ds.call_LPL.dims == ("variants", "samples", "local_genotypes")


class Test1000G2020AnnotationsExample:
Expand Down
41 changes: 41 additions & 0 deletions tests/test_vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ def schema(schema_path):
return vcf2zarr.VcfZarrSchema.fromjson(f.read())


@pytest.fixture(scope="module")
def local_alleles_schema(icf_path, tmp_path_factory):
# FIXME: this is stupid way of getting a test fixture, should
# be much easier.
out = tmp_path_factory.mktemp("data") / "example.schema.json"
with open(out, "w") as f:
vcf2zarr.mkschema(icf_path, f, local_alleles=True)
with open(out) as f:
return vcf2zarr.VcfZarrSchema.fromjson(f.read())


@pytest.fixture(scope="module")
def zarr_path(icf_path, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.zarr"
Expand Down Expand Up @@ -445,6 +456,36 @@ def test_call_GQ(self, schema):
}


class TestLocalAllelesDefaultSchema:
def test_differences(self, schema, local_alleles_schema):
assert len(schema.fields) == len(local_alleles_schema.fields) - 1
non_local = [f for f in local_alleles_schema.fields if f.name != "call_LA"]
assert schema.fields == non_local

def test_call_LA(self, local_alleles_schema):
d = get_field_dict(local_alleles_schema, "call_LA")
assert d == {
"name": "call_LA",
"dtype": "i1",
"shape": (9, 3, 2),
"chunks": (1000, 10000, 2),
"dimensions": ("variants", "samples", "local_alleles"),
"description": (
"0-based indices into REF+ALT, indicating which alleles"
" are relevant (local) for the current sample"
),
"vcf_field": None,
"compressor": {
"id": "blosc",
"cname": "zstd",
"clevel": 7,
"shuffle": 0,
"blocksize": 0,
},
"filters": tuple(),
}


class TestVcfDescriptions:
@pytest.mark.parametrize(
("field", "description"),
Expand Down