From 117c5f899b348779b54fea70047994b2603f1eab Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 28 May 2025 13:47:00 +0100 Subject: [PATCH 1/2] Fixup and simplify plink validation tests --- tests/test_plink_validation.py | 27 +++++++++++++-------------- vcztools/cli.py | 2 +- vcztools/plink.py | 25 +++++++++++-------------- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/tests/test_plink_validation.py b/tests/test_plink_validation.py index 8792f19..46f1b39 100644 --- a/tests/test_plink_validation.py +++ b/tests/test_plink_validation.py @@ -2,10 +2,9 @@ import pathlib import subprocess -import click.testing as ct import pytest -import vcztools.cli as cli +from vcztools.plink import write_plink from . import utils @@ -21,7 +20,6 @@ def assert_files_identical(path1, path2): assert b1 == b2 -@pytest.mark.skip("Removing plink from CLI for bugfix release") # fmt: off @pytest.mark.parametrize( ("args", "vcf_file"), @@ -35,20 +33,21 @@ def assert_files_identical(path1, path2): ) # fmt: on def test_conversion_identical(tmp_path, args, vcf_file): + tmp_path = pathlib.Path("tmp/plink") + original = pathlib.Path("tests/data/vcf") / vcf_file vcz = utils.vcz_path_cache(original) - plink_workdir = tmp_path / "plink1.9" - plink_workdir.mkdir() + plink_prefix = str(tmp_path / "plink") plink_bin = os.environ.get("PLINK_BIN", "plink") - cmd = f"{plink_bin} --vcf {original.absolute()} {args}" - result = subprocess.run(cmd, shell=True, cwd=plink_workdir, capture_output=True) + cmd = f"{plink_bin} --out {plink_prefix} --vcf {original.absolute()} {args}" + result = subprocess.run(cmd, shell=True, capture_output=True) assert result.returncode == 0 - cmd = f"view-plink1 {vcz.absolute()} {args}" - runner = ct.CliRunner() - with runner.isolated_filesystem(tmp_path) as working_dir: - vcz_workdir = pathlib.Path(working_dir) - result = runner.invoke(cli.vcztools_main, cmd, catch_exceptions=False) - for filename in ["plink.fam", "plink.bim", "plink.bed"]: - assert_files_identical(vcz_workdir / filename, plink_workdir / filename) + vcztools_prefix = str(tmp_path / "vcztools") + write_plink(vcz, vcztools_prefix) + + for suffix in [".bed", ".fam", ".bim"]: + plink_file = plink_prefix + suffix + vcztools_file = vcztools_prefix + suffix + assert_files_identical(plink_file, vcztools_file) diff --git a/vcztools/cli.py b/vcztools/cli.py index b84b947..a0f8890 100644 --- a/vcztools/cli.py +++ b/vcztools/cli.py @@ -304,4 +304,4 @@ def vcztools_main(): vcztools_main.add_command(index) vcztools_main.add_command(query) vcztools_main.add_command(view) -# vcztools_main.add_command(view_plink1) +vcztools_main.add_command(view_plink1) diff --git a/vcztools/plink.py b/vcztools/plink.py index 7951661..c90c9cf 100644 --- a/vcztools/plink.py +++ b/vcztools/plink.py @@ -2,8 +2,6 @@ Convert VCZ to plink 1 binary format. """ -import pathlib - import numpy as np import pandas as pd import zarr @@ -83,12 +81,12 @@ def _compute_alleles(self, G, alleles): Returns the a12 alleles for the specified chunk of data. """ max_alleles = alleles.shape[1] - if max_alleles != 2: - raise ValueError( - "Only biallelic VCFs supported currently: " - "please comment on https://github.com/sgkit-dev/vcztools/issues/224 " - "if this limitation affects you" - ) + # if max_alleles != 2: + # raise ValueError( + # "Only biallelic VCFs supported currently: " + # "please comment on https://github.com/sgkit-dev/vcztools/issues/224 " + # "if this limitation affects you" + # ) num_variants = G.shape[0] num_samples = G.shape[1] a12_allele = np.zeros((num_variants, 2), dtype=int) - 1 @@ -155,14 +153,13 @@ def run(self): f.write(generate_fam(self.root)) -def write_plink(vcz_path, out, include=None, exclude=None): - out_prefix = pathlib.Path(out) - # out_prefix.mkdir(exist_ok=True) +def write_plink(vcz_path, out_prefix, include=None, exclude=None): + out_prefix = str(out_prefix) writer = Writer( vcz_path, - bed_path=out_prefix.with_suffix(".bed"), - fam_path=out_prefix.with_suffix(".fam"), - bim_path=out_prefix.with_suffix(".bim"), + bed_path=out_prefix + ".bed", + fam_path=out_prefix + ".fam", + bim_path=out_prefix + ".bim", include=include, exclude=exclude, ) From 4c222df0bcb0d9b26bd6c3fe8ca2bc19aa3d62f1 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 28 May 2025 20:10:48 +0100 Subject: [PATCH 2/2] Various updates for plink Incomplete --- tests/test_plink.py | 20 +++++ tests/test_plink_validation.py | 11 ++- tests/test_tskit_data.py | 54 +++++++++---- vcztools/plink.py | 142 ++++++++++++++++++++++++++++----- 4 files changed, 190 insertions(+), 37 deletions(-) diff --git a/tests/test_plink.py b/tests/test_plink.py index 36916ac..531ac34 100644 --- a/tests/test_plink.py +++ b/tests/test_plink.py @@ -1,4 +1,5 @@ import numpy as np +import numpy.testing as nt import pytest from vcztools import plink @@ -162,3 +163,22 @@ def test_nonsensical_data(self, num_variants, num_samples): b1 = encode_genotypes(g, a12) b2 = plink.encode_genotypes(g, a12) assert b1 == b2 + + +class TestReorderAlleles: + @pytest.mark.parametrize( + ["genotypes", "minor", "major", "reordered"], + [ + ([[0, 0], [0, 1]], 1, 0, [[0, 0], [0, 1]]), + ([[1, 1], [1, 0]], 0, 1, [[0, 0], [0, 1]]), + ([[2, 2], [2, 1]], 1, 2, [[0, 0], [0, 1]]), + ([[2, 2], [2, 0]], 0, 2, [[0, 0], [0, 1]]), + ([[0, 0], [-1, 1]], 1, 0, [[0, 0], [-1, 1]]), + ([[0, 0], [-2, 1]], 1, 0, [[0, 0], [-2, 1]]), + ], + ) + def test_examples(self, genotypes, minor, major, reordered): + result = plink.reorder_alleles(np.array(genotypes)) + assert result[0] == minor + assert result[1] == major + nt.assert_array_equal(reordered, result[2]) diff --git a/tests/test_plink_validation.py b/tests/test_plink_validation.py index 46f1b39..8b14b5d 100644 --- a/tests/test_plink_validation.py +++ b/tests/test_plink_validation.py @@ -9,13 +9,14 @@ from . import utils -def assert_files_identical(path1, path2): +def assert_files_identical(path1, path2, binary=False): """ Asserts the files are byte-for-byte identical. """ - with open(path1, "rb") as f: + mode = "rb" if binary else "r" + with open(path1, mode) as f: b1 = f.read() - with open(path2, "rb") as f: + with open(path2, mode) as f: b2 = f.read() assert b1 == b2 @@ -47,7 +48,9 @@ def test_conversion_identical(tmp_path, args, vcf_file): vcztools_prefix = str(tmp_path / "vcztools") write_plink(vcz, vcztools_prefix) - for suffix in [".bed", ".fam", ".bim"]: + assert_files_identical(plink_prefix + ".bed", vcztools_prefix + ".bed", binary=True) + + for suffix in [".fam", ".bim"]: plink_file = plink_prefix + suffix vcztools_file = vcztools_prefix + suffix assert_files_identical(plink_file, vcztools_file) diff --git a/tests/test_tskit_data.py b/tests/test_tskit_data.py index e6f3e2d..5568afa 100644 --- a/tests/test_tskit_data.py +++ b/tests/test_tskit_data.py @@ -99,7 +99,7 @@ def fx_diploid_missing_data(tmp_path): tables.sites.add_row(2, ancestral_state="A") tables.sites.add_row(9, ancestral_state="T") tables.mutations.add_row(site=0, node=0, derived_state="G") - tables.mutations.add_row(site=1, node=5, derived_state="C") + tables.mutations.add_row(site=1, node=3, derived_state="C") zarr_path = tmp_path / "sim.vcz" ts = tables.tree_sequence() model_map = ts.map_to_vcf_model(ploidy=2) @@ -113,7 +113,7 @@ def test_diploid_missing_data(fx_diploid_missing_data): ds.call_genotype.values, [ [[1, 0], [0, 0], [-1, -1]], - [[0, 0], [1, 1], [-1, -1]], + [[0, 0], [0, 1], [-1, -1]], ], ) @@ -126,18 +126,24 @@ def fx_diploid_multi_allelic(tmp_path): # ┊ ┏┻┓ ┏┻┓ ┊ # 0.00┊ 0 1 2 3 ┊ # 0 10 - # | | - # pos 2 9 - # anc A T + # | | | + # pos 2 4 9 + # anc A G T ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence tables = ts.dump_tables() tables.sites.add_row(2, ancestral_state="A") + tables.sites.add_row(4, ancestral_state="G") tables.sites.add_row(9, ancestral_state="T") tables.mutations.add_row(site=0, node=0, derived_state="G") - tables.mutations.add_row(site=1, node=1, derived_state="G") - tables.mutations.add_row(site=1, node=5, derived_state="C") + # Two mutations making the ancestral state the minor allele. + tables.mutations.add_row(site=1, node=4, derived_state="T") + tables.mutations.add_row(site=1, node=2, derived_state="T") + tables.mutations.add_row(site=2, node=1, derived_state="G") + tables.mutations.add_row(site=2, node=5, derived_state="C") zarr_path = tmp_path / "sim.vcz" ts = tables.tree_sequence() + with open("diploid_multi_allelic.vcf", "w") as f: + ts.write_vcf(f, ploidy=2, individual_names=["s0", "s1"]) model_map = ts.map_to_vcf_model(ploidy=2) ts2z.convert(ts, zarr_path, model_mapping=model_map) return zarr_path @@ -146,12 +152,15 @@ def fx_diploid_multi_allelic(tmp_path): def test_diploid_multi_allelic(fx_diploid_multi_allelic): ds = sg.load_dataset(fx_diploid_multi_allelic) # NOTE this example is constructed so that the rarest allele is in the middle - # of the alleles array - nt.assert_array_equal(ds.variant_allele.values, [["A", "G", ""], ["T", "G", "C"]]) + # of the alleles array, and it's the first in the third. + nt.assert_array_equal( + ds.variant_allele.values, [["A", "G", ""], ["G", "T", ""], ["T", "G", "C"]] + ) nt.assert_array_equal( ds.call_genotype.values, [ [[1, 0], [0, 0]], + [[1, 1], [1, 0]], [[0, 1], [2, 2]], ], ) @@ -263,15 +272,17 @@ def recode_plink_hets(G): class TestPlinkRoundTrip: - def assert_bio2zarr_rt(self, tmp_path, tskit_vcz): - # import pathlib - # tmp_path = pathlib.Path("tmp/plink") + def run_bio2zarr_rt(self, tmp_path, tskit_vcz): plink_path = tmp_path / "plink" write_plink(tskit_vcz, plink_path) rt_vcz_path = tmp_path / "rt.vcz" p2z.convert(plink_path, rt_vcz_path) ds1 = sg.load_dataset(tskit_vcz) ds2 = sg.load_dataset(rt_vcz_path) + return ds1, ds2 + + def assert_bio2zarr_rt(self, tmp_path, tskit_vcz): + ds1, ds2 = self.run_bio2zarr_rt(tmp_path, tskit_vcz) assert np.all(ds1["call_genotype_phased"]) assert np.all(~ds2["call_genotype_phased"]) @@ -279,7 +290,6 @@ def assert_bio2zarr_rt(self, tmp_path, tskit_vcz): nt.assert_array_equal( recode_plink_hets(ds1["call_genotype"].values), ds2["call_genotype"] ) - drop_fields = [ "variant_id", "variant_id_mask", @@ -297,6 +307,20 @@ def test_diploid_msprime_sim(self, tmp_path, fx_diploid_msprime_sim): def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data): self.assert_bio2zarr_rt(tmp_path, fx_diploid_missing_data) + @pytest.mark.skip("Need a different example") def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic): - with pytest.raises(ValueError, match="Only biallelic VCFs supported"): - self.assert_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic) + _, ds = self.run_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic) + + print(ds.call_genotype.values) + + nt.assert_array_equal( + ds.variant_allele.values, [["A", "G"], ["G", "T"], ["T", "C"]] + ) + nt.assert_array_equal( + ds.call_genotype.values, + [ + [[0, 1], [0, 0]], + [[1, 1], [0, 1]], + [[-1, -1], [1, 1]], + ], + ) diff --git a/vcztools/plink.py b/vcztools/plink.py index c90c9cf..3f15c1c 100644 --- a/vcztools/plink.py +++ b/vcztools/plink.py @@ -20,6 +20,12 @@ def encode_genotypes(genotypes, a12_allele=None): return bytes(_vcztools.encode_plink(G, a12_allele).data) +def new_encode_genotypes(genotypes): + G = np.asarray([genotypes], dtype=np.int8) + a12_allele = np.asarray([[1, 0]], dtype=G.dtype) + return bytes(_vcztools.encode_plink(G, a12_allele).data) + + def generate_fam(root): # TODO generate an error if sample_id contains a space sample_id = root["sample_id"][:].astype(str) @@ -66,6 +72,82 @@ def generate_bim(root, a12_allele): return df.to_csv(header=False, sep="\t", index=False) +def translate(genotypes, alleles): + copy = np.full_like(genotypes, -1) + for new, old in enumerate(alleles): + copy[genotypes == old] = new + return copy + + +def reorder_alleles(genotypes): + """ + Return a tuple (minor, major, reordered) for the specified numpy array of diploid + genotypes for a given variant. The reordered genotypes will be coded such that + 0 is the major allele and 1 is the minor allele. The returned values of minor + and major are the respective alleles in the *input* genotypes. + """ + num_samples = genotypes.shape[0] + assert genotypes.shape[1] == 2 + g = genotypes.reshape(num_samples * 2) + assert np.all(g >= -2) + max_alleles = np.max(g) + count = np.bincount(g + 2, minlength=max_alleles + 2) + # [dimension pad, missing data, alleles[0], alleles[1], ...] + count = count[2:] + if max_alleles == 1 and count[0] > count[1]: + # Common case - exit early with nothing to do + return 1, 0, genotypes + # General case + argsort = np.argsort(count) + + # a12_allele[j, 1] = 0 + major = 0 + if argsort[-1] == 0: + # print("Ref allele most frequent") + # Ref allele is most frequent - chose lowest allele from next most + # frequent class + f = count[argsort[-2]] + else: + # print("Ref allele not most frequent") + f = count[argsort[-1]] + a = 1 + while count[a] != f: + a += 1 + # a12_allele[j, 0] = a + minor = a + + # assert a12_allele[j, 0] != a12_allele[j, 1] + # if alleles[j][1] == "": + # a12_allele[j, 0] = -1 + + # if argsort[-1] == 0: + # # print("Ref allele most frequent") + # # Ref allele is most frequent - chose lowest allele from next most + # # frequent class + # f = count[argsort[-2]] + # else: + # # print("Ref allele not most frequent") + # f = count[argsort[-1]] + # a = 1 + # while count[a] != f: + # a += 1 + # minor = a + + # a12_allele[j, 0] = a + # assert a12_allele[j, 0] != a12_allele[j, 1] + # if alleles[j][1] == "": + # a12_allele[j, 0] = -1 + + # print("count = ", count, argsort) + # major = argsort[-1] + # minor = -1 + # if len(argsort) > 1: + # minor = argsort[-2] + # while minor >= 0 and count[minor] == count[argsort[-2]]: + # minor -= 1 + return minor, major, translate(genotypes, [major, minor]) + + class Writer: def __init__( self, vcz_path, bed_path, fam_path, bim_path, include=None, exclude=None @@ -90,7 +172,7 @@ def _compute_alleles(self, G, alleles): num_variants = G.shape[0] num_samples = G.shape[1] a12_allele = np.zeros((num_variants, 2), dtype=int) - 1 - for j, g in enumerate(G): + for j in range(num_variants): g = g.reshape(num_samples * 2) assert np.all(g >= -2) count = np.bincount(g + 2, minlength=max_alleles + 2) @@ -124,30 +206,54 @@ def _compute_alleles(self, G, alleles): # ) return a12_allele - def _write_genotypes(self): + def run(self): ci = retrieval.variant_chunk_iter( - self.root, fields=["call_genotype", "variant_allele"] - ) - call_genotype = self.root["call_genotype"] - a12_allele = zarr.zeros( - (call_genotype.shape[0], 2), chunks=call_genotype.chunks[0], dtype=int + self.root, + fields=[ + "call_genotype", + "variant_allele", + "variant_contig", + "variant_position", + "variant_id", + ], ) + contig_id = self.root["contig_id"][:].astype(str) + bim_rows = [] with open(self.bed_path, "wb") as bed_file: bed_file.write(bytes([0x6C, 0x1B, 0x01])) - for j, chunk in enumerate(ci): - G = chunk["call_genotype"] - a12 = self._compute_alleles(G, chunk["variant_allele"]) - buff = encode_genotypes(G, a12) - bed_file.write(buff) - a12_allele.blocks[j] = a12 - return a12_allele[:] - - def run(self): - a12_allele = self._write_genotypes() + for chunk in ci: + iterator = zip( + chunk["variant_contig"], + chunk["variant_id"], + chunk["variant_position"], + chunk["variant_allele"], + chunk["call_genotype"], + ) + for contig, variant_id, position, alleles, genotypes in iterator: + print("VAR", position, alleles, genotypes) + minor, major, genotypes = reorder_alleles(genotypes) + print("mapped:", minor, major, genotypes) + allele_1 = "0" + if minor != -1: + allele_1 = alleles[minor] + allele_2 = alleles[major] + buff = new_encode_genotypes(genotypes) + bed_file.write(buff) + bim_rows.append( + { + "Contig": contig_id[contig], + "VariantId": variant_id, + "GeneticPosition": 0, + "Position": position, + "Allele1": allele_1, + "Allele2": allele_2, + } + ) + bim_df = pd.DataFrame(bim_rows) with open(self.bim_path, "w") as f: - f.write(generate_bim(self.root, a12_allele)) + f.write(bim_df.to_csv(header=False, sep="\t", index=False)) with open(self.fam_path, "w") as f: f.write(generate_fam(self.root))