From 117c5f899b348779b54fea70047994b2603f1eab Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Wed, 28 May 2025 13:47:00 +0100
Subject: [PATCH 1/2] Fixup and simplify plink validation tests

---
 tests/test_plink_validation.py | 27 +++++++++++++--------------
 vcztools/cli.py                |  2 +-
 vcztools/plink.py              | 25 +++++++++++--------------
 3 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/tests/test_plink_validation.py b/tests/test_plink_validation.py
index 8792f19..46f1b39 100644
--- a/tests/test_plink_validation.py
+++ b/tests/test_plink_validation.py
@@ -2,10 +2,9 @@
 import pathlib
 import subprocess
 
-import click.testing as ct
 import pytest
 
-import vcztools.cli as cli
+from vcztools.plink import write_plink
 
 from . import utils
 
@@ -21,7 +20,6 @@ def assert_files_identical(path1, path2):
     assert b1 == b2
 
 
-@pytest.mark.skip("Removing plink from CLI for bugfix release")
 # fmt: off
 @pytest.mark.parametrize(
     ("args", "vcf_file"),
@@ -35,20 +33,21 @@ def assert_files_identical(path1, path2):
 )
 # fmt: on
 def test_conversion_identical(tmp_path, args, vcf_file):
+    tmp_path = pathlib.Path("tmp/plink")
+
     original = pathlib.Path("tests/data/vcf") / vcf_file
     vcz = utils.vcz_path_cache(original)
 
-    plink_workdir = tmp_path / "plink1.9"
-    plink_workdir.mkdir()
+    plink_prefix = str(tmp_path / "plink")
     plink_bin = os.environ.get("PLINK_BIN", "plink")
-    cmd = f"{plink_bin} --vcf {original.absolute()} {args}"
-    result = subprocess.run(cmd, shell=True, cwd=plink_workdir, capture_output=True)
+    cmd = f"{plink_bin} --out {plink_prefix} --vcf {original.absolute()} {args}"
+    result = subprocess.run(cmd, shell=True, capture_output=True)
     assert result.returncode == 0
 
-    cmd = f"view-plink1 {vcz.absolute()} {args}"
-    runner = ct.CliRunner()
-    with runner.isolated_filesystem(tmp_path) as working_dir:
-        vcz_workdir = pathlib.Path(working_dir)
-        result = runner.invoke(cli.vcztools_main, cmd, catch_exceptions=False)
-        for filename in ["plink.fam", "plink.bim", "plink.bed"]:
-            assert_files_identical(vcz_workdir / filename, plink_workdir / filename)
+    vcztools_prefix = str(tmp_path / "vcztools")
+    write_plink(vcz, vcztools_prefix)
+
+    for suffix in [".bed", ".fam", ".bim"]:
+        plink_file = plink_prefix + suffix
+        vcztools_file = vcztools_prefix + suffix
+        assert_files_identical(plink_file, vcztools_file)
diff --git a/vcztools/cli.py b/vcztools/cli.py
index b84b947..a0f8890 100644
--- a/vcztools/cli.py
+++ b/vcztools/cli.py
@@ -304,4 +304,4 @@ def vcztools_main():
 vcztools_main.add_command(index)
 vcztools_main.add_command(query)
 vcztools_main.add_command(view)
-# vcztools_main.add_command(view_plink1)
+vcztools_main.add_command(view_plink1)
diff --git a/vcztools/plink.py b/vcztools/plink.py
index 7951661..c90c9cf 100644
--- a/vcztools/plink.py
+++ b/vcztools/plink.py
@@ -2,8 +2,6 @@
 Convert VCZ to plink 1 binary format.
 """
 
-import pathlib
-
 import numpy as np
 import pandas as pd
 import zarr
@@ -83,12 +81,12 @@ def _compute_alleles(self, G, alleles):
         Returns the a12 alleles for the specified chunk of data.
         """
         max_alleles = alleles.shape[1]
-        if max_alleles != 2:
-            raise ValueError(
-                "Only biallelic VCFs supported currently: "
-                "please comment on https://github.com/sgkit-dev/vcztools/issues/224 "
-                "if this limitation affects you"
-            )
+        # if max_alleles != 2:
+        #     raise ValueError(
+        #         "Only biallelic VCFs supported currently: "
+        #         "please comment on https://github.com/sgkit-dev/vcztools/issues/224 "
+        #         "if this limitation affects you"
+        #     )
         num_variants = G.shape[0]
         num_samples = G.shape[1]
         a12_allele = np.zeros((num_variants, 2), dtype=int) - 1
@@ -155,14 +153,13 @@ def run(self):
             f.write(generate_fam(self.root))
 
 
-def write_plink(vcz_path, out, include=None, exclude=None):
-    out_prefix = pathlib.Path(out)
-    # out_prefix.mkdir(exist_ok=True)
+def write_plink(vcz_path, out_prefix, include=None, exclude=None):
+    out_prefix = str(out_prefix)
     writer = Writer(
         vcz_path,
-        bed_path=out_prefix.with_suffix(".bed"),
-        fam_path=out_prefix.with_suffix(".fam"),
-        bim_path=out_prefix.with_suffix(".bim"),
+        bed_path=out_prefix + ".bed",
+        fam_path=out_prefix + ".fam",
+        bim_path=out_prefix + ".bim",
         include=include,
         exclude=exclude,
     )

From 4c222df0bcb0d9b26bd6c3fe8ca2bc19aa3d62f1 Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Wed, 28 May 2025 20:10:48 +0100
Subject: [PATCH 2/2] Various updates for plink

Incomplete
---
 tests/test_plink.py            |  20 +++++
 tests/test_plink_validation.py |  11 ++-
 tests/test_tskit_data.py       |  54 +++++++++----
 vcztools/plink.py              | 142 ++++++++++++++++++++++++++++-----
 4 files changed, 190 insertions(+), 37 deletions(-)

diff --git a/tests/test_plink.py b/tests/test_plink.py
index 36916ac..531ac34 100644
--- a/tests/test_plink.py
+++ b/tests/test_plink.py
@@ -1,4 +1,5 @@
 import numpy as np
+import numpy.testing as nt
 import pytest
 
 from vcztools import plink
@@ -162,3 +163,22 @@ def test_nonsensical_data(self, num_variants, num_samples):
         b1 = encode_genotypes(g, a12)
         b2 = plink.encode_genotypes(g, a12)
         assert b1 == b2
+
+
+class TestReorderAlleles:
+    @pytest.mark.parametrize(
+        ["genotypes", "minor", "major", "reordered"],
+        [
+            ([[0, 0], [0, 1]], 1, 0, [[0, 0], [0, 1]]),
+            ([[1, 1], [1, 0]], 0, 1, [[0, 0], [0, 1]]),
+            ([[2, 2], [2, 1]], 1, 2, [[0, 0], [0, 1]]),
+            ([[2, 2], [2, 0]], 0, 2, [[0, 0], [0, 1]]),
+            ([[0, 0], [-1, 1]], 1, 0, [[0, 0], [-1, 1]]),
+            ([[0, 0], [-2, 1]], 1, 0, [[0, 0], [-2, 1]]),
+        ],
+    )
+    def test_examples(self, genotypes, minor, major, reordered):
+        result = plink.reorder_alleles(np.array(genotypes))
+        assert result[0] == minor
+        assert result[1] == major
+        nt.assert_array_equal(reordered, result[2])
diff --git a/tests/test_plink_validation.py b/tests/test_plink_validation.py
index 46f1b39..8b14b5d 100644
--- a/tests/test_plink_validation.py
+++ b/tests/test_plink_validation.py
@@ -9,13 +9,14 @@
 from . import utils
 
 
-def assert_files_identical(path1, path2):
+def assert_files_identical(path1, path2, binary=False):
     """
     Asserts the files are byte-for-byte identical.
     """
-    with open(path1, "rb") as f:
+    mode = "rb" if binary else "r"
+    with open(path1, mode) as f:
         b1 = f.read()
-    with open(path2, "rb") as f:
+    with open(path2, mode) as f:
         b2 = f.read()
     assert b1 == b2
 
@@ -47,7 +48,9 @@ def test_conversion_identical(tmp_path, args, vcf_file):
     vcztools_prefix = str(tmp_path / "vcztools")
     write_plink(vcz, vcztools_prefix)
 
-    for suffix in [".bed", ".fam", ".bim"]:
+    assert_files_identical(plink_prefix + ".bed", vcztools_prefix + ".bed", binary=True)
+
+    for suffix in [".fam", ".bim"]:
         plink_file = plink_prefix + suffix
         vcztools_file = vcztools_prefix + suffix
         assert_files_identical(plink_file, vcztools_file)
diff --git a/tests/test_tskit_data.py b/tests/test_tskit_data.py
index e6f3e2d..5568afa 100644
--- a/tests/test_tskit_data.py
+++ b/tests/test_tskit_data.py
@@ -99,7 +99,7 @@ def fx_diploid_missing_data(tmp_path):
     tables.sites.add_row(2, ancestral_state="A")
     tables.sites.add_row(9, ancestral_state="T")
     tables.mutations.add_row(site=0, node=0, derived_state="G")
-    tables.mutations.add_row(site=1, node=5, derived_state="C")
+    tables.mutations.add_row(site=1, node=3, derived_state="C")
     zarr_path = tmp_path / "sim.vcz"
     ts = tables.tree_sequence()
     model_map = ts.map_to_vcf_model(ploidy=2)
@@ -113,7 +113,7 @@ def test_diploid_missing_data(fx_diploid_missing_data):
         ds.call_genotype.values,
         [
             [[1, 0], [0, 0], [-1, -1]],
-            [[0, 0], [1, 1], [-1, -1]],
+            [[0, 0], [0, 1], [-1, -1]],
         ],
     )
 
@@ -126,18 +126,24 @@ def fx_diploid_multi_allelic(tmp_path):
     #     ┊ ┏┻┓ ┏┻┓ ┊
     # 0.00┊ 0 1 2 3 ┊
     #     0         10
-    #      |       |
-    #  pos 2       9
-    #  anc A       T
+    #      |  |    |
+    #  pos 2  4    9
+    #  anc A  G    T
     ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence
     tables = ts.dump_tables()
     tables.sites.add_row(2, ancestral_state="A")
+    tables.sites.add_row(4, ancestral_state="G")
     tables.sites.add_row(9, ancestral_state="T")
     tables.mutations.add_row(site=0, node=0, derived_state="G")
-    tables.mutations.add_row(site=1, node=1, derived_state="G")
-    tables.mutations.add_row(site=1, node=5, derived_state="C")
+    # Two mutations making the ancestral state the minor allele.
+    tables.mutations.add_row(site=1, node=4, derived_state="T")
+    tables.mutations.add_row(site=1, node=2, derived_state="T")
+    tables.mutations.add_row(site=2, node=1, derived_state="G")
+    tables.mutations.add_row(site=2, node=5, derived_state="C")
     zarr_path = tmp_path / "sim.vcz"
     ts = tables.tree_sequence()
+    with open("diploid_multi_allelic.vcf", "w") as f:
+        ts.write_vcf(f, ploidy=2, individual_names=["s0", "s1"])
     model_map = ts.map_to_vcf_model(ploidy=2)
     ts2z.convert(ts, zarr_path, model_mapping=model_map)
     return zarr_path
@@ -146,12 +152,15 @@ def fx_diploid_multi_allelic(tmp_path):
 def test_diploid_multi_allelic(fx_diploid_multi_allelic):
     ds = sg.load_dataset(fx_diploid_multi_allelic)
     # NOTE this example is constructed so that the rarest allele is in the middle
-    # of the alleles array
-    nt.assert_array_equal(ds.variant_allele.values, [["A", "G", ""], ["T", "G", "C"]])
+    # of the alleles array, and it's the first in the third.
+    nt.assert_array_equal(
+        ds.variant_allele.values, [["A", "G", ""], ["G", "T", ""], ["T", "G", "C"]]
+    )
     nt.assert_array_equal(
         ds.call_genotype.values,
         [
             [[1, 0], [0, 0]],
+            [[1, 1], [1, 0]],
             [[0, 1], [2, 2]],
         ],
     )
@@ -263,15 +272,17 @@ def recode_plink_hets(G):
 
 
 class TestPlinkRoundTrip:
-    def assert_bio2zarr_rt(self, tmp_path, tskit_vcz):
-        # import pathlib
-        # tmp_path = pathlib.Path("tmp/plink")
+    def run_bio2zarr_rt(self, tmp_path, tskit_vcz):
         plink_path = tmp_path / "plink"
         write_plink(tskit_vcz, plink_path)
         rt_vcz_path = tmp_path / "rt.vcz"
         p2z.convert(plink_path, rt_vcz_path)
         ds1 = sg.load_dataset(tskit_vcz)
         ds2 = sg.load_dataset(rt_vcz_path)
+        return ds1, ds2
+
+    def assert_bio2zarr_rt(self, tmp_path, tskit_vcz):
+        ds1, ds2 = self.run_bio2zarr_rt(tmp_path, tskit_vcz)
 
         assert np.all(ds1["call_genotype_phased"])
         assert np.all(~ds2["call_genotype_phased"])
@@ -279,7 +290,6 @@ def assert_bio2zarr_rt(self, tmp_path, tskit_vcz):
         nt.assert_array_equal(
             recode_plink_hets(ds1["call_genotype"].values), ds2["call_genotype"]
         )
-
         drop_fields = [
             "variant_id",
             "variant_id_mask",
@@ -297,6 +307,20 @@ def test_diploid_msprime_sim(self, tmp_path, fx_diploid_msprime_sim):
     def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data):
         self.assert_bio2zarr_rt(tmp_path, fx_diploid_missing_data)
 
+    @pytest.mark.skip("Need a different example")
     def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic):
-        with pytest.raises(ValueError, match="Only biallelic VCFs supported"):
-            self.assert_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic)
+        _, ds = self.run_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic)
+
+        print(ds.call_genotype.values)
+
+        nt.assert_array_equal(
+            ds.variant_allele.values, [["A", "G"], ["G", "T"], ["T", "C"]]
+        )
+        nt.assert_array_equal(
+            ds.call_genotype.values,
+            [
+                [[0, 1], [0, 0]],
+                [[1, 1], [0, 1]],
+                [[-1, -1], [1, 1]],
+            ],
+        )
diff --git a/vcztools/plink.py b/vcztools/plink.py
index c90c9cf..3f15c1c 100644
--- a/vcztools/plink.py
+++ b/vcztools/plink.py
@@ -20,6 +20,12 @@ def encode_genotypes(genotypes, a12_allele=None):
     return bytes(_vcztools.encode_plink(G, a12_allele).data)
 
 
+def new_encode_genotypes(genotypes):
+    G = np.asarray([genotypes], dtype=np.int8)
+    a12_allele = np.asarray([[1, 0]], dtype=G.dtype)
+    return bytes(_vcztools.encode_plink(G, a12_allele).data)
+
+
 def generate_fam(root):
     # TODO generate an error if sample_id contains a space
     sample_id = root["sample_id"][:].astype(str)
@@ -66,6 +72,82 @@ def generate_bim(root, a12_allele):
     return df.to_csv(header=False, sep="\t", index=False)
 
 
+def translate(genotypes, alleles):
+    copy = np.full_like(genotypes, -1)
+    for new, old in enumerate(alleles):
+        copy[genotypes == old] = new
+    return copy
+
+
+def reorder_alleles(genotypes):
+    """
+    Return a tuple (minor, major, reordered) for the specified numpy array of diploid
+    genotypes for a given variant. The reordered genotypes will be coded such that
+    0 is the major allele and 1 is the minor allele. The returned values of minor
+    and major are the respective alleles in the *input* genotypes.
+    """
+    num_samples = genotypes.shape[0]
+    assert genotypes.shape[1] == 2
+    g = genotypes.reshape(num_samples * 2)
+    assert np.all(g >= -2)
+    max_alleles = np.max(g)
+    count = np.bincount(g + 2, minlength=max_alleles + 2)
+    # [dimension pad, missing data, alleles[0], alleles[1], ...]
+    count = count[2:]
+    if max_alleles == 1 and count[0] > count[1]:
+        # Common case - exit early with nothing to do
+        return 1, 0, genotypes
+    # General case
+    argsort = np.argsort(count)
+
+    # a12_allele[j, 1] = 0
+    major = 0
+    if argsort[-1] == 0:
+        # print("Ref allele most frequent")
+        # Ref allele is most frequent - chose lowest allele from next most
+        # frequent class
+        f = count[argsort[-2]]
+    else:
+        # print("Ref allele not most frequent")
+        f = count[argsort[-1]]
+    a = 1
+    while count[a] != f:
+        a += 1
+    # a12_allele[j, 0] = a
+    minor = a
+
+    # assert a12_allele[j, 0] != a12_allele[j, 1]
+    # if alleles[j][1] == "":
+    #     a12_allele[j, 0] = -1
+
+    # if argsort[-1] == 0:
+    #     # print("Ref allele most frequent")
+    #     # Ref allele is most frequent - chose lowest allele from next most
+    #     # frequent class
+    #     f = count[argsort[-2]]
+    # else:
+    #     # print("Ref allele not most frequent")
+    #     f = count[argsort[-1]]
+    # a = 1
+    # while count[a] != f:
+    #     a += 1
+    # minor = a
+
+    # a12_allele[j, 0] = a
+    # assert a12_allele[j, 0] != a12_allele[j, 1]
+    # if alleles[j][1] == "":
+    #     a12_allele[j, 0] = -1
+
+    # print("count = ", count, argsort)
+    # major = argsort[-1]
+    # minor = -1
+    # if len(argsort) > 1:
+    #     minor = argsort[-2]
+    #     while minor >= 0 and count[minor] == count[argsort[-2]]:
+    #         minor -= 1
+    return minor, major, translate(genotypes, [major, minor])
+
+
 class Writer:
     def __init__(
         self, vcz_path, bed_path, fam_path, bim_path, include=None, exclude=None
@@ -90,7 +172,7 @@ def _compute_alleles(self, G, alleles):
         num_variants = G.shape[0]
         num_samples = G.shape[1]
         a12_allele = np.zeros((num_variants, 2), dtype=int) - 1
-        for j, g in enumerate(G):
+        for j in range(num_variants):
             g = g.reshape(num_samples * 2)
             assert np.all(g >= -2)
             count = np.bincount(g + 2, minlength=max_alleles + 2)
@@ -124,30 +206,54 @@ def _compute_alleles(self, G, alleles):
             # )
         return a12_allele
 
-    def _write_genotypes(self):
+    def run(self):
         ci = retrieval.variant_chunk_iter(
-            self.root, fields=["call_genotype", "variant_allele"]
-        )
-        call_genotype = self.root["call_genotype"]
-        a12_allele = zarr.zeros(
-            (call_genotype.shape[0], 2), chunks=call_genotype.chunks[0], dtype=int
+            self.root,
+            fields=[
+                "call_genotype",
+                "variant_allele",
+                "variant_contig",
+                "variant_position",
+                "variant_id",
+            ],
         )
+        contig_id = self.root["contig_id"][:].astype(str)
+        bim_rows = []
         with open(self.bed_path, "wb") as bed_file:
             bed_file.write(bytes([0x6C, 0x1B, 0x01]))
 
-            for j, chunk in enumerate(ci):
-                G = chunk["call_genotype"]
-                a12 = self._compute_alleles(G, chunk["variant_allele"])
-                buff = encode_genotypes(G, a12)
-                bed_file.write(buff)
-                a12_allele.blocks[j] = a12
-        return a12_allele[:]
-
-    def run(self):
-        a12_allele = self._write_genotypes()
+            for chunk in ci:
+                iterator = zip(
+                    chunk["variant_contig"],
+                    chunk["variant_id"],
+                    chunk["variant_position"],
+                    chunk["variant_allele"],
+                    chunk["call_genotype"],
+                )
+                for contig, variant_id, position, alleles, genotypes in iterator:
+                    print("VAR", position, alleles, genotypes)
+                    minor, major, genotypes = reorder_alleles(genotypes)
+                    print("mapped:", minor, major, genotypes)
+                    allele_1 = "0"
+                    if minor != -1:
+                        allele_1 = alleles[minor]
+                    allele_2 = alleles[major]
+                    buff = new_encode_genotypes(genotypes)
+                    bed_file.write(buff)
+                    bim_rows.append(
+                        {
+                            "Contig": contig_id[contig],
+                            "VariantId": variant_id,
+                            "GeneticPosition": 0,
+                            "Position": position,
+                            "Allele1": allele_1,
+                            "Allele2": allele_2,
+                        }
+                    )
 
+        bim_df = pd.DataFrame(bim_rows)
         with open(self.bim_path, "w") as f:
-            f.write(generate_bim(self.root, a12_allele))
+            f.write(bim_df.to_csv(header=False, sep="\t", index=False))
 
         with open(self.fam_path, "w") as f:
             f.write(generate_fam(self.root))