Skip to content

Commit fab64e1

Browse files
Add round-trip tests using plink's VCF output
1 parent 37d6fd5 commit fab64e1

File tree

4 files changed

+67
-21
lines changed

4 files changed

+67
-21
lines changed

bio2zarr/cli.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,9 @@ def convert_plink(
561561
samples_chunk_size,
562562
):
563563
"""
564-
In development; DO NOT USE!
564+
Convert plink fileset to VCF Zarr. Results are equivalent to
565+
`plink1.9 --bfile prefix --recode vcf-iid --out tmp` then running
566+
`vcf2zarr convert tmp.vcf zarr_path`
565567
"""
566568
setup_logging(verbose)
567569
plink.convert(

bio2zarr/plink.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
8888
gt[bed_chunk[i] == 2] = 1
8989
gt[bed_chunk[i] == 1, 0] = 1
9090

91-
yield vcz.VariantData(max(len(a) for a in alleles), alleles, gt, phased)
91+
# rlen is the length of the REF in PLINK as there's no END annotations
92+
yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
9293

9394
def generate_schema(
9495
self,
@@ -117,6 +118,9 @@ def generate_schema(
117118
f"variants={dimensions['variants'].chunk_size}, "
118119
f"samples={dimensions['samples'].chunk_size}"
119120
)
121+
# If we don't have SVLEN or END annotations, the rlen field is defined
122+
# as the length of the REF
123+
max_len = self.bed.allele_2.itemsize
120124

121125
array_specs = [
122126
vcz.ZarrArraySpec(
@@ -147,7 +151,7 @@ def generate_schema(
147151
vcz.ZarrArraySpec(
148152
source=None,
149153
name="variant_length",
150-
dtype="i4",
154+
dtype=core.min_int_dtype(0, max_len),
151155
dimensions=["variants"],
152156
description="Length of each variant",
153157
),

tests/test_cli.py

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1026,23 +1026,6 @@ def test_part_size_multiple_vcfs(self):
10261026
]
10271027

10281028

1029-
@pytest.mark.parametrize(
1030-
"cmd",
1031-
[
1032-
main.bio2zarr,
1033-
cli.vcf2zarr_main,
1034-
cli.plink2zarr,
1035-
cli.vcfpartition,
1036-
cli.tskit2zarr,
1037-
],
1038-
)
1039-
def test_version(cmd):
1040-
runner = ct.CliRunner()
1041-
result = runner.invoke(cmd, ["--version"], catch_exceptions=False)
1042-
s = f"version {provenance.__version__}\n"
1043-
assert result.stdout.endswith(s)
1044-
1045-
10461029
class TestTskitEndToEnd:
10471030
def test_convert(self, tmp_path):
10481031
ts_path = "tests/data/ts/example.trees"
@@ -1060,3 +1043,39 @@ def test_convert(self, tmp_path):
10601043
assert result.exit_code == 0
10611044
# Arbitrary check
10621045
assert "variant_position" in result.stdout
1046+
1047+
1048+
class TestPlinkEndToEnd:
1049+
def test_convert(self, tmp_path):
1050+
ts_path = "tests/data/plink/example"
1051+
zarr_path = tmp_path / "zarr"
1052+
runner = ct.CliRunner()
1053+
result = runner.invoke(
1054+
cli.plink2zarr,
1055+
f"convert {ts_path} {zarr_path}",
1056+
catch_exceptions=False,
1057+
)
1058+
assert result.exit_code == 0
1059+
result = runner.invoke(
1060+
cli.vcf2zarr_main, f"inspect {zarr_path}", catch_exceptions=False
1061+
)
1062+
assert result.exit_code == 0
1063+
# Arbitrary check
1064+
assert "variant_position" in result.stdout
1065+
1066+
1067+
@pytest.mark.parametrize(
1068+
"cmd",
1069+
[
1070+
main.bio2zarr,
1071+
cli.vcf2zarr_main,
1072+
cli.plink2zarr,
1073+
cli.vcfpartition,
1074+
cli.tskit2zarr,
1075+
],
1076+
)
1077+
def test_version(cmd):
1078+
runner = ct.CliRunner()
1079+
result = runner.invoke(cmd, ["--version"], catch_exceptions=False)
1080+
s = f"version {provenance.__version__}\n"
1081+
assert result.stdout.endswith(s)

tests/test_plink.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import sgkit as sg
88
import xarray.testing as xt
99

10-
from bio2zarr import plink
10+
from bio2zarr import plink, vcf
1111

1212

1313
def test_missing_dependency():
@@ -396,3 +396,24 @@ def test_variant_length(self, ds):
396396
ds.variant_length,
397397
[1, 1, 1, 1, 1, 1],
398398
)
399+
400+
401+
@pytest.mark.parametrize(
402+
"prefix",
403+
[
404+
"tests/data/plink/example",
405+
],
406+
)
407+
def test_against_plinks_vcf_output(prefix, tmp_path):
408+
vcf_path = prefix + ".vcf"
409+
plink_zarr = tmp_path / "plink.zarr"
410+
vcf_zarr = tmp_path / "vcf.zarr"
411+
plink.convert(prefix, plink_zarr)
412+
vcf.convert([vcf_path], vcf_zarr)
413+
ds1 = sg.load_dataset(plink_zarr)
414+
ds2 = (
415+
sg.load_dataset(vcf_zarr)
416+
.drop_dims("filters")
417+
.drop_vars(["variant_quality", "variant_PR", "contig_length"])
418+
)
419+
xt.assert_equal(ds1, ds2)

0 commit comments

Comments
 (0)