sgkit-dev · jeromekelleher · May 21, 2025 · May 20, 2025 · May 20, 2025 · May 20, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -104,11 +104,11 @@ jobs:
           python -m venv env-plink
           source env-plink/bin/activate
           python -m pip install .
-          python -m bio2zarr plink2zarr convert tests/data/plink/example.bed plink.vcz > plink.txt 2>&1 || echo $? > plink_exit.txt
+          python -m bio2zarr plink2zarr convert tests/data/plink/example plink.vcz > plink.txt 2>&1 || echo $? > plink_exit.txt
           test "$(cat plink_exit.txt)" = "1"
           grep -q "This process requires the optional bed_reader module. Install it with: pip install bio2zarr\[plink\]" plink.txt
           python -m pip install '.[plink]'
-          python -m bio2zarr plink2zarr convert tests/data/plink/example.bed plink.vcz
+          python -m bio2zarr plink2zarr convert tests/data/plink/example plink.vcz
           deactivate
 
           python -m venv env-vcf

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,8 @@
 # 0.1.6 2025-0X-XX
 
-- Make format-specific dependencies optional (#385)
-
-- Add contigs to plink output (#344)
+- Initial version of supported plink2zarr (#390, #344, #382)
 
-- Add variant_length and indexing to plink output (#382)
+- Make format-specific dependencies optional (#385)
 
 Breaking changes
 
@@ -14,6 +12,8 @@ Breaking changes
 - Add dimensions and default compressor and filter settings to the schema.
   (#361)
 
+- Various changes to existing experimental plink encoding (#390)
+
 # 0.1.5 2025-03-31
 
 - Add support for merging contig IDs across multiple VCFs (#335)

diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
@@ -381,7 +381,7 @@ def encode(
     worker_processes,
 ):
     """
-    Convert intermediate columnar format to vcfzarr.
+    Convert intermediate columnar format to VCF Zarr.
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
@@ -504,7 +504,7 @@ def convert_vcf(
     local_alleles,
 ):
     """
-    Convert input VCF(s) directly to vcfzarr (not recommended for large files).
+    Convert input VCF(s) directly to VCF Zarr (not recommended for large files).
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
@@ -523,7 +523,7 @@ def convert_vcf(
 @click.group(cls=NaturalOrderGroup, name="vcf2zarr")
 def vcf2zarr_main():
     """
-    Convert VCF file(s) to the vcfzarr format.
+    Convert VCF file(s) to VCF Zarr format.
 
     See the online documentation at https://sgkit-dev.github.io/bio2zarr/
     for more information.
@@ -561,7 +561,9 @@ def convert_plink(
     samples_chunk_size,
 ):
     """
-    In development; DO NOT USE!
+    Convert plink fileset to VCF Zarr. Results are equivalent to
+    `plink1.9 --bfile prefix --keep-allele-order --recode vcf-iid --out tmp`
+    then running `vcf2zarr convert tmp.vcf zarr_path`
     """
     setup_logging(verbose)
     plink.convert(
@@ -577,6 +579,9 @@ def convert_plink(
 @version
 @click.group()
 def plink2zarr():
+    """
+    Convert plink fileset(s) to VCF Zarr format
+    """
     pass
 
 
@@ -676,6 +681,9 @@ def convert_tskit(
 @version
 @click.group()
 def tskit2zarr():
+    """
+    Convert tskit tree sequence(s) to VCF Zarr format
+    """
     pass
 
 

diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -1,25 +1,46 @@
+import dataclasses
 import logging
 import pathlib
 
 import numpy as np
-import zarr
 
 from bio2zarr import constants, core, vcz
 
 logger = logging.getLogger(__name__)
 
 
+@dataclasses.dataclass
+class PlinkPaths:
+    bed_path: str
+    bim_path: str
+    fam_path: str
+
+
 class PlinkFormat(vcz.Source):
     @core.requires_optional_dependency("bed_reader", "plink")
-    def __init__(self, path):
+    def __init__(self, prefix):
         import bed_reader
 
-        self._path = pathlib.Path(path)
-        self.bed = bed_reader.open_bed(path, num_threads=1, count_A1=False)
+        # TODO we will need support multiple chromosomes here to join
+        # plinks into on big zarr. So, these will require multiple
+        # bed and bim files, but should share a .fam
+        self.prefix = str(prefix)
+        paths = PlinkPaths(
+            self.prefix + ".bed",
+            self.prefix + ".bim",
+            self.prefix + ".fam",
+        )
+        self.bed = bed_reader.open_bed(
+            paths.bed_path,
+            bim_location=paths.bim_path,
+            fam_location=paths.fam_path,
+            num_threads=1,
+            count_A1=True,
+        )
 
     @property
     def path(self):
-        return self._path
+        return self.prefix
 
     @property
     def num_records(self):
@@ -46,9 +67,12 @@ def iter_field(self, field_name, shape, start, stop):
         assert field_name == "position"  # Only position field is supported from plink
         yield from self.bed.bp_position[start:stop]
 
+    def iter_id(self, start, stop):
+        yield from self.bed.sid[start:stop]
+
     def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
-        ref_field = self.bed.allele_1
-        alt_field = self.bed.allele_2
+        alt_field = self.bed.allele_1
+        ref_field = self.bed.allele_2
         bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T
         gt = np.zeros(shape, dtype=np.int8)
         phased = np.zeros(shape[:-1], dtype=bool)
@@ -61,9 +85,10 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
             gt[:] = 0
             gt[bed_chunk[i] == -127] = -1
             gt[bed_chunk[i] == 2] = 1
-            gt[bed_chunk[i] == 1, 0] = 1
+            gt[bed_chunk[i] == 1, 1] = 1
 
-            yield vcz.VariantData(max(len(a) for a in alleles), alleles, gt, phased)
+            # rlen is the length of the REF in PLINK as there's no END annotations
+            yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
 
     def generate_schema(
         self,
@@ -92,6 +117,9 @@ def generate_schema(
             f"variants={dimensions['variants'].chunk_size}, "
             f"samples={dimensions['samples'].chunk_size}"
         )
+        # If we don't have SVLEN or END annotations, the rlen field is defined
+        # as the length of the REF
+        max_len = self.bed.allele_2.itemsize
 
         array_specs = [
             vcz.ZarrArraySpec(
@@ -107,10 +135,22 @@ def generate_schema(
                 dimensions=["variants", "alleles"],
                 description=None,
             ),
+            vcz.ZarrArraySpec(
+                name="variant_id",
+                dtype="O",
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_id_mask",
+                dtype="bool",
+                dimensions=["variants"],
+                description=None,
+            ),
             vcz.ZarrArraySpec(
                 source=None,
                 name="variant_length",
-                dtype="i4",
+                dtype=core.min_int_dtype(0, max_len),
                 dimensions=["variants"],
                 description="Length of each variant",
             ),
@@ -147,20 +187,20 @@ def generate_schema(
 
 
 def convert(
-    bed_path,
-    zarr_path,
+    prefix,
+    out,
     *,
     variants_chunk_size=None,
     samples_chunk_size=None,
     worker_processes=1,
     show_progress=False,
 ):
-    plink_format = PlinkFormat(bed_path)
+    plink_format = PlinkFormat(prefix)
     schema_instance = plink_format.generate_schema(
         variants_chunk_size=variants_chunk_size,
         samples_chunk_size=samples_chunk_size,
     )
-    zarr_path = pathlib.Path(zarr_path)
+    zarr_path = pathlib.Path(out)
     vzw = vcz.VcfZarrWriter(PlinkFormat, zarr_path)
     # Rough heuristic to split work up enough to keep utilisation high
     target_num_partitions = max(1, worker_processes * 4)
@@ -175,39 +215,3 @@ def convert(
     )
     vzw.finalise(show_progress)
     vzw.create_index()
-
-
-# FIXME do this more efficiently - currently reading the whole thing
-# in for convenience, and also comparing call-by-call
-@core.requires_optional_dependency("bed_reader", "plink")
-def validate(bed_path, zarr_path):
-    import bed_reader
-
-    root = zarr.open(store=zarr_path, mode="r")
-    call_genotype = root["call_genotype"][:]
-
-    bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
-
-    assert call_genotype.shape[0] == bed.sid_count
-    assert call_genotype.shape[1] == bed.iid_count
-    bed_genotypes = bed.read(dtype="int8").T
-    assert call_genotype.shape[0] == bed_genotypes.shape[0]
-    assert call_genotype.shape[1] == bed_genotypes.shape[1]
-    assert call_genotype.shape[2] == 2
-
-    row_id = 0
-    for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
-        # print("ROW", row_id)
-        # print(bed_row, zarr_row)
-        row_id += 1
-        for bed_call, zarr_call in zip(bed_row, zarr_row):
-            if bed_call == -127:
-                assert list(zarr_call) == [-1, -1]
-            elif bed_call == 0:
-                assert list(zarr_call) == [0, 0]
-            elif bed_call == 1:
-                assert list(zarr_call) == [1, 0]
-            elif bed_call == 2:
-                assert list(zarr_call) == [1, 1]
-            else:  # pragma no cover
-                raise AssertionError(f"Unexpected bed call {bed_call}")
diff --git a/docs/_toc.yml b/docs/_toc.yml
@@ -6,6 +6,9 @@ chapters:
   sections:
   - file: vcf2zarr/tutorial
   - file: vcf2zarr/cli_ref
+- file: plink2zarr/overview
+  sections:
+  - file: plink2zarr/cli_ref
 - file: vcfpartition/overview
   sections:
   - file: vcfpartition/cli_ref
diff --git a/docs/intro.md b/docs/intro.md
@@ -8,6 +8,9 @@
 - {ref}`sec-vcf2zarr` converts VCF data to
   [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format.
 
+- {ref}`sec-plink2zarr` converts PLINK 1.0 data to
+  [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format.
+
 - {ref}`sec-vcfpartition` is a utility to split an input
   VCF into a given number of partitions. This is useful for
   parallel processing of VCF data.
@@ -17,10 +20,8 @@
 `bio2zarr` is in development, contributions, feedback and issues are welcome
 at the [GitHub repository](https://github.com/sgkit-dev/bio2zarr).
 
-Support for converting PLINK data to VCF Zarr is partially implemented,
-and adding BGEN and [tskit](https://tskit.dev/) support is also planned.
 If you would like to see
-support for other formats (or an interested in helping with implementing),
+support for other formats such as BGEN (or an interested in helping with implementing),
 please open an [issue on Github](https://github.com/sgkit-dev/bio2zarr/issues)
 to discuss!
 

diff --git a/docs/plink2zarr/cli_ref.md b/docs/plink2zarr/cli_ref.md
@@ -0,0 +1,17 @@
+(sec-plink2zarr-cli-ref)=
+# CLI Reference
+
+% A note on cross references... There's some weird long-standing problem with
+% cross referencing program values in Sphinx, which means that we can't use
+% the built-in labels generated by sphinx-click. We can make our own explicit
+% targets, but these have to have slightly weird names to avoid conflicting
+% with what sphinx-click is doing. So, hence the cmd- prefix.
+% Based on: https://github.com/skypilot-org/skypilot/pull/2834
+
+```{eval-rst}
+
+.. _cmd-plink2zarr-convert:
+.. click:: bio2zarr.cli:convert_plink
+   :prog: plink2zarr convert
+   :nested: full
+
diff --git a/docs/plink2zarr/overview.md b/docs/plink2zarr/overview.md
@@ -0,0 +1,38 @@
+(sec-plink2zarr)=
+# plink2zarr
+
+Convert plink data to the
+[VCF Zarr specification](https://github.com/sgkit-dev/vcf-zarr-spec/)
+reliably in parallel.
+
+See {ref}`sec-plink2zarr-cli-ref` for detailed documentation on
+command line options.
+
+Conversion of the plink data model to VCF follows the semantics of plink1.9 as closely
+as possible. That is, given a binary plink fileset with prefix "fileset" (i.e.,
+fileset.bed, fileset.bim, fileset.fam), running
+```
+$ plink2zarr convert fileset out.vcz
+```
+should produce the same result in ``out.vcz`` as
+```
+$ plink1.9 --bfile fileset --keep-allele-order --recode vcf-iid --out tmp
+$ vcf2zarr convert tmp.vcf out.vcz
+```
+
+:::{warning}
+It is important to note that we follow the same conventions as plink 2.0
+where the A1 allele in the [bim file](https://www.cog-genomics.org/plink/2.0/formats#bim)
+is the VCF ALT and A2 is the REF.
+:::
+
+:::{note}
+Currently we only convert the basic VCF-like data from plink, and don't include
+phenotypes and pedigree information. These are planned as future enhancements.
+Please comment on [this issue](https://github.com/sgkit-dev/bio2zarr/issues/392)
+if you are interested in this functionality.
+:::
+
+
+
+
diff --git a/tests/data/plink/example.vcf b/tests/data/plink/example.vcf
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.2
+##fileDate=20250521
+##source=PLINKv1.90
+##contig=<ID=1,length=21>
+##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional reference allele, may not be based on real reference genome">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	ind0	ind1	ind2	ind3	ind4	ind5	ind6	ind7	ind8	ind9
+1	10	1_10	GG	A	.	.	PR	GT	1/1	1/1	1/1	0/0	0/0	0/0	0/0	0/0	0/0	0/0
+1	20	1_20	C	TTT	.	.	PR	GT	1/1	1/1	1/1	1/1	0/0	0/0	0/0	0/0	0/0	0/0
diff --git a/tests/data/plink/example_with_fam.vcf b/tests/data/plink/example_with_fam.vcf
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.2
+##fileDate=20250521
+##source=PLINKv1.90
+##contig=<ID=1,length=21>
+##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional reference allele, may not be based on real reference genome">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	ind0	ind1	ind2	ind3	ind4	ind5	ind6	ind7	ind8	ind9
+1	10	1_10	G	A	.	.	PR	GT	1/1	1/1	1/1	0/0	0/0	0/0	0/0	0/0	0/0	0/0
+1	20	1_20	C	T	.	.	PR	GT	1/1	1/1	1/1	1/1	0/0	0/0	0/0	0/0	0/0	0/0