diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dcd71845..911489d2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -104,11 +104,11 @@ jobs: python -m venv env-plink source env-plink/bin/activate python -m pip install . - python -m bio2zarr plink2zarr convert tests/data/plink/example.bed plink.vcz > plink.txt 2>&1 || echo $? > plink_exit.txt + python -m bio2zarr plink2zarr convert tests/data/plink/example plink.vcz > plink.txt 2>&1 || echo $? > plink_exit.txt test "$(cat plink_exit.txt)" = "1" grep -q "This process requires the optional bed_reader module. Install it with: pip install bio2zarr\[plink\]" plink.txt python -m pip install '.[plink]' - python -m bio2zarr plink2zarr convert tests/data/plink/example.bed plink.vcz + python -m bio2zarr plink2zarr convert tests/data/plink/example plink.vcz deactivate python -m venv env-vcf diff --git a/CHANGELOG.md b/CHANGELOG.md index 8241ad4f..ab7fae43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,8 @@ # 0.1.6 2025-0X-XX -- Make format-specific dependencies optional (#385) - -- Add contigs to plink output (#344) +- Initial version of supported plink2zarr (#390, #344, #382) -- Add variant_length and indexing to plink output (#382) +- Make format-specific dependencies optional (#385) Breaking changes @@ -14,6 +12,8 @@ Breaking changes - Add dimensions and default compressor and filter settings to the schema. (#361) +- Various changes to existing experimental plink encoding (#390) + # 0.1.5 2025-03-31 - Add support for merging contig IDs across multiple VCFs (#335) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index a2ae88d3..6d41ed27 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -381,7 +381,7 @@ def encode( worker_processes, ): """ - Convert intermediate columnar format to vcfzarr. + Convert intermediate columnar format to VCF Zarr. """ setup_logging(verbose) check_overwrite_dir(zarr_path, force) @@ -504,7 +504,7 @@ def convert_vcf( local_alleles, ): """ - Convert input VCF(s) directly to vcfzarr (not recommended for large files). + Convert input VCF(s) directly to VCF Zarr (not recommended for large files). """ setup_logging(verbose) check_overwrite_dir(zarr_path, force) @@ -523,7 +523,7 @@ def convert_vcf( @click.group(cls=NaturalOrderGroup, name="vcf2zarr") def vcf2zarr_main(): """ - Convert VCF file(s) to the vcfzarr format. + Convert VCF file(s) to VCF Zarr format. See the online documentation at https://sgkit-dev.github.io/bio2zarr/ for more information. @@ -561,7 +561,9 @@ def convert_plink( samples_chunk_size, ): """ - In development; DO NOT USE! + Convert plink fileset to VCF Zarr. Results are equivalent to + `plink1.9 --bfile prefix --keep-allele-order --recode vcf-iid --out tmp` + then running `vcf2zarr convert tmp.vcf zarr_path` """ setup_logging(verbose) plink.convert( @@ -577,6 +579,9 @@ def convert_plink( @version @click.group() def plink2zarr(): + """ + Convert plink fileset(s) to VCF Zarr format + """ pass @@ -676,6 +681,9 @@ def convert_tskit( @version @click.group() def tskit2zarr(): + """ + Convert tskit tree sequence(s) to VCF Zarr format + """ pass diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 69ffc863..e170ddea 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -1,25 +1,46 @@ +import dataclasses import logging import pathlib import numpy as np -import zarr from bio2zarr import constants, core, vcz logger = logging.getLogger(__name__) +@dataclasses.dataclass +class PlinkPaths: + bed_path: str + bim_path: str + fam_path: str + + class PlinkFormat(vcz.Source): @core.requires_optional_dependency("bed_reader", "plink") - def __init__(self, path): + def __init__(self, prefix): import bed_reader - self._path = pathlib.Path(path) - self.bed = bed_reader.open_bed(path, num_threads=1, count_A1=False) + # TODO we will need support multiple chromosomes here to join + # plinks into on big zarr. So, these will require multiple + # bed and bim files, but should share a .fam + self.prefix = str(prefix) + paths = PlinkPaths( + self.prefix + ".bed", + self.prefix + ".bim", + self.prefix + ".fam", + ) + self.bed = bed_reader.open_bed( + paths.bed_path, + bim_location=paths.bim_path, + fam_location=paths.fam_path, + num_threads=1, + count_A1=True, + ) @property def path(self): - return self._path + return self.prefix @property def num_records(self): @@ -46,9 +67,12 @@ def iter_field(self, field_name, shape, start, stop): assert field_name == "position" # Only position field is supported from plink yield from self.bed.bp_position[start:stop] + def iter_id(self, start, stop): + yield from self.bed.sid[start:stop] + def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles): - ref_field = self.bed.allele_1 - alt_field = self.bed.allele_2 + alt_field = self.bed.allele_1 + ref_field = self.bed.allele_2 bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T gt = np.zeros(shape, dtype=np.int8) phased = np.zeros(shape[:-1], dtype=bool) @@ -61,9 +85,10 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles): gt[:] = 0 gt[bed_chunk[i] == -127] = -1 gt[bed_chunk[i] == 2] = 1 - gt[bed_chunk[i] == 1, 0] = 1 + gt[bed_chunk[i] == 1, 1] = 1 - yield vcz.VariantData(max(len(a) for a in alleles), alleles, gt, phased) + # rlen is the length of the REF in PLINK as there's no END annotations + yield vcz.VariantData(len(alleles[0]), alleles, gt, phased) def generate_schema( self, @@ -92,6 +117,9 @@ def generate_schema( f"variants={dimensions['variants'].chunk_size}, " f"samples={dimensions['samples'].chunk_size}" ) + # If we don't have SVLEN or END annotations, the rlen field is defined + # as the length of the REF + max_len = self.bed.allele_2.itemsize array_specs = [ vcz.ZarrArraySpec( @@ -107,10 +135,22 @@ def generate_schema( dimensions=["variants", "alleles"], description=None, ), + vcz.ZarrArraySpec( + name="variant_id", + dtype="O", + dimensions=["variants"], + description=None, + ), + vcz.ZarrArraySpec( + name="variant_id_mask", + dtype="bool", + dimensions=["variants"], + description=None, + ), vcz.ZarrArraySpec( source=None, name="variant_length", - dtype="i4", + dtype=core.min_int_dtype(0, max_len), dimensions=["variants"], description="Length of each variant", ), @@ -147,20 +187,20 @@ def generate_schema( def convert( - bed_path, - zarr_path, + prefix, + out, *, variants_chunk_size=None, samples_chunk_size=None, worker_processes=1, show_progress=False, ): - plink_format = PlinkFormat(bed_path) + plink_format = PlinkFormat(prefix) schema_instance = plink_format.generate_schema( variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, ) - zarr_path = pathlib.Path(zarr_path) + zarr_path = pathlib.Path(out) vzw = vcz.VcfZarrWriter(PlinkFormat, zarr_path) # Rough heuristic to split work up enough to keep utilisation high target_num_partitions = max(1, worker_processes * 4) @@ -175,39 +215,3 @@ def convert( ) vzw.finalise(show_progress) vzw.create_index() - - -# FIXME do this more efficiently - currently reading the whole thing -# in for convenience, and also comparing call-by-call -@core.requires_optional_dependency("bed_reader", "plink") -def validate(bed_path, zarr_path): - import bed_reader - - root = zarr.open(store=zarr_path, mode="r") - call_genotype = root["call_genotype"][:] - - bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1) - - assert call_genotype.shape[0] == bed.sid_count - assert call_genotype.shape[1] == bed.iid_count - bed_genotypes = bed.read(dtype="int8").T - assert call_genotype.shape[0] == bed_genotypes.shape[0] - assert call_genotype.shape[1] == bed_genotypes.shape[1] - assert call_genotype.shape[2] == 2 - - row_id = 0 - for bed_row, zarr_row in zip(bed_genotypes, call_genotype): - # print("ROW", row_id) - # print(bed_row, zarr_row) - row_id += 1 - for bed_call, zarr_call in zip(bed_row, zarr_row): - if bed_call == -127: - assert list(zarr_call) == [-1, -1] - elif bed_call == 0: - assert list(zarr_call) == [0, 0] - elif bed_call == 1: - assert list(zarr_call) == [1, 0] - elif bed_call == 2: - assert list(zarr_call) == [1, 1] - else: # pragma no cover - raise AssertionError(f"Unexpected bed call {bed_call}") diff --git a/docs/_toc.yml b/docs/_toc.yml index abc53837..007cfea5 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -6,6 +6,9 @@ chapters: sections: - file: vcf2zarr/tutorial - file: vcf2zarr/cli_ref +- file: plink2zarr/overview + sections: + - file: plink2zarr/cli_ref - file: vcfpartition/overview sections: - file: vcfpartition/cli_ref diff --git a/docs/intro.md b/docs/intro.md index eef9ea63..8b563fae 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -8,6 +8,9 @@ - {ref}`sec-vcf2zarr` converts VCF data to [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format. +- {ref}`sec-plink2zarr` converts PLINK 1.0 data to + [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format. + - {ref}`sec-vcfpartition` is a utility to split an input VCF into a given number of partitions. This is useful for parallel processing of VCF data. @@ -17,10 +20,8 @@ `bio2zarr` is in development, contributions, feedback and issues are welcome at the [GitHub repository](https://github.com/sgkit-dev/bio2zarr). -Support for converting PLINK data to VCF Zarr is partially implemented, -and adding BGEN and [tskit](https://tskit.dev/) support is also planned. If you would like to see -support for other formats (or an interested in helping with implementing), +support for other formats such as BGEN (or an interested in helping with implementing), please open an [issue on Github](https://github.com/sgkit-dev/bio2zarr/issues) to discuss! diff --git a/docs/plink2zarr/cli_ref.md b/docs/plink2zarr/cli_ref.md new file mode 100644 index 00000000..2e13db25 --- /dev/null +++ b/docs/plink2zarr/cli_ref.md @@ -0,0 +1,17 @@ +(sec-plink2zarr-cli-ref)= +# CLI Reference + +% A note on cross references... There's some weird long-standing problem with +% cross referencing program values in Sphinx, which means that we can't use +% the built-in labels generated by sphinx-click. We can make our own explicit +% targets, but these have to have slightly weird names to avoid conflicting +% with what sphinx-click is doing. So, hence the cmd- prefix. +% Based on: https://github.com/skypilot-org/skypilot/pull/2834 + +```{eval-rst} + +.. _cmd-plink2zarr-convert: +.. click:: bio2zarr.cli:convert_plink + :prog: plink2zarr convert + :nested: full + diff --git a/docs/plink2zarr/overview.md b/docs/plink2zarr/overview.md new file mode 100644 index 00000000..cb156fdd --- /dev/null +++ b/docs/plink2zarr/overview.md @@ -0,0 +1,38 @@ +(sec-plink2zarr)= +# plink2zarr + +Convert plink data to the +[VCF Zarr specification](https://github.com/sgkit-dev/vcf-zarr-spec/) +reliably in parallel. + +See {ref}`sec-plink2zarr-cli-ref` for detailed documentation on +command line options. + +Conversion of the plink data model to VCF follows the semantics of plink1.9 as closely +as possible. That is, given a binary plink fileset with prefix "fileset" (i.e., +fileset.bed, fileset.bim, fileset.fam), running +``` +$ plink2zarr convert fileset out.vcz +``` +should produce the same result in ``out.vcz`` as +``` +$ plink1.9 --bfile fileset --keep-allele-order --recode vcf-iid --out tmp +$ vcf2zarr convert tmp.vcf out.vcz +``` + +:::{warning} +It is important to note that we follow the same conventions as plink 2.0 +where the A1 allele in the [bim file](https://www.cog-genomics.org/plink/2.0/formats#bim) +is the VCF ALT and A2 is the REF. +::: + +:::{note} +Currently we only convert the basic VCF-like data from plink, and don't include +phenotypes and pedigree information. These are planned as future enhancements. +Please comment on [this issue](https://github.com/sgkit-dev/bio2zarr/issues/392) +if you are interested in this functionality. +::: + + + + diff --git a/tests/data/plink/example.vcf b/tests/data/plink/example.vcf new file mode 100644 index 00000000..719b9f7d --- /dev/null +++ b/tests/data/plink/example.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##fileDate=20250521 +##source=PLINKv1.90 +##contig= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ind0 ind1 ind2 ind3 ind4 ind5 ind6 ind7 ind8 ind9 +1 10 1_10 GG A . . PR GT 1/1 1/1 1/1 0/0 0/0 0/0 0/0 0/0 0/0 0/0 +1 20 1_20 C TTT . . PR GT 1/1 1/1 1/1 1/1 0/0 0/0 0/0 0/0 0/0 0/0 diff --git a/tests/data/plink/example_with_fam.vcf b/tests/data/plink/example_with_fam.vcf new file mode 100644 index 00000000..9784ba22 --- /dev/null +++ b/tests/data/plink/example_with_fam.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##fileDate=20250521 +##source=PLINKv1.90 +##contig= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ind0 ind1 ind2 ind3 ind4 ind5 ind6 ind7 ind8 ind9 +1 10 1_10 G A . . PR GT 1/1 1/1 1/1 0/0 0/0 0/0 0/0 0/0 0/0 0/0 +1 20 1_20 C T . . PR GT 1/1 1/1 1/1 1/1 0/0 0/0 0/0 0/0 0/0 0/0 diff --git a/tests/data/plink/plink_sim_10s_100v_10pmiss.vcf b/tests/data/plink/plink_sim_10s_100v_10pmiss.vcf new file mode 100644 index 00000000..ac8bc4ce --- /dev/null +++ b/tests/data/plink/plink_sim_10s_100v_10pmiss.vcf @@ -0,0 +1,107 @@ +##fileformat=VCFv4.2 +##fileDate=20250521 +##source=PLINKv1.90 +##contig= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 000 001 002 003 004 005 006 007 008 009 +1 1 1:1:G:CGCGCG G CGCGCG . . PR GT 0/1 1/1 0/1 0/1 0/0 0/0 1/1 1/1 0/1 1/1 +1 2 1:2:ACT:G ACT G . . PR GT 0/1 0/0 1/1 0/1 0/0 0/1 0/1 0/1 0/1 1/1 +1 3 1:3:ACT:G ACT G . . PR GT 0/1 ./. 1/1 1/1 1/1 0/0 0/1 0/1 0/1 0/1 +1 4 1:4:G:CGCGCG G CGCGCG . . PR GT 0/1 0/1 ./. 0/0 0/1 1/1 1/1 0/1 1/1 0/1 +1 5 1:5:G:CGCGCG G CGCGCG . . PR GT ./. 0/0 1/1 1/1 0/1 1/1 ./. 0/0 0/1 0/0 +1 6 1:6:ACT:G ACT G . . PR GT 0/0 1/1 0/0 0/0 1/1 0/1 0/1 1/1 0/1 1/1 +1 7 1:7:G:CGCGCG G CGCGCG . . PR GT 0/1 0/0 0/1 0/1 0/0 0/1 0/1 0/1 ./. 0/0 +1 8 1:8:T:GTGG T GTGG . . PR GT ./. 1/1 0/1 1/1 1/1 1/1 ./. 1/1 0/1 1/1 +1 9 1:9:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 1/1 1/1 1/1 1/1 ./. 1/1 1/1 +1 10 1:10:A:C A C . . PR GT 0/1 ./. 0/1 0/1 0/1 0/1 ./. 0/1 0/1 1/1 +1 11 1:11:ACT:G ACT G . . PR GT 1/1 1/1 1/1 1/1 ./. 1/1 1/1 ./. 1/1 ./. +1 12 1:12:G:CGCGCG G CGCGCG . . PR GT 1/1 1/1 ./. 0/0 0/1 1/1 0/1 ./. 0/0 1/1 +1 13 1:13:G:CGCGCG G CGCGCG . . PR GT 1/1 1/1 1/1 1/1 1/1 0/1 1/1 1/1 1/1 1/1 +1 14 1:14:T:GTGG T GTGG . . PR GT 1/1 ./. 0/1 1/1 0/1 1/1 ./. 0/1 ./. ./. +1 15 1:15:ACT:G ACT G . . PR GT 0/1 1/1 0/0 1/1 1/1 1/1 0/0 1/1 0/1 0/1 +1 16 1:16:A:C A C . . PR GT 0/0 0/0 0/0 ./. 0/0 0/0 0/0 0/0 0/0 0/0 +1 17 1:17:ACT:G ACT G . . PR GT 1/1 1/1 1/1 0/1 0/1 1/1 0/0 ./. 1/1 1/1 +1 18 1:18:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 1/1 1/1 1/1 1/1 1/1 1/1 1/1 +1 19 1:19:A:C A C . . PR GT 0/1 0/1 0/0 0/1 0/1 ./. 1/1 0/1 1/1 0/1 +1 20 1:20:A:C A C . . PR GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 ./. 0/1 0/0 +1 21 1:21:T:GTGG T GTGG . . PR GT 1/1 0/1 1/1 1/1 1/1 1/1 0/1 0/1 1/1 1/1 +1 22 1:22:G:CGCGCG G CGCGCG . . PR GT 0/1 0/1 0/1 ./. 1/1 1/1 ./. 1/1 0/1 ./. +1 23 1:23:T:GTGG T GTGG . . PR GT 1/1 0/1 0/1 ./. ./. 0/1 0/1 0/1 1/1 0/1 +1 24 1:24:A:C A C . . PR GT 0/0 0/0 0/1 0/0 0/1 0/1 0/0 0/0 0/0 0/0 +1 25 1:25:A:C A C . . PR GT 0/0 0/0 ./. ./. ./. ./. 0/0 0/1 0/1 0/0 +1 26 1:26:ACT:G ACT G . . PR GT 0/1 1/1 1/1 0/1 1/1 ./. 1/1 0/1 0/1 1/1 +1 27 1:27:G:CGCGCG G CGCGCG . . PR GT 0/1 0/0 0/0 0/0 ./. 0/0 ./. 0/1 0/0 0/0 +1 28 1:28:ACT:G ACT G . . PR GT 0/0 1/1 0/0 0/1 0/0 ./. 0/1 0/0 0/1 1/1 +1 29 1:29:T:GTGG T GTGG . . PR GT 1/1 0/1 1/1 1/1 ./. ./. ./. 1/1 0/1 1/1 +1 30 1:30:A:C A C . . PR GT 0/0 0/0 0/0 0/1 0/0 0/1 0/1 0/0 0/1 0/0 +1 31 1:31:T:GTGG T GTGG . . PR GT 0/0 1/1 1/1 1/1 1/1 0/1 1/1 ./. 0/1 1/1 +1 32 1:32:G:CGCGCG G CGCGCG . . PR GT 0/1 0/1 0/0 0/0 ./. 0/0 1/1 ./. 0/0 0/1 +1 33 1:33:ACT:G ACT G . . PR GT 0/1 0/1 0/1 0/1 1/1 0/1 1/1 0/1 0/1 1/1 +1 34 1:34:G:CGCGCG G CGCGCG . . PR GT 0/1 1/1 0/1 ./. 0/1 ./. 0/1 0/0 0/0 ./. +1 35 1:35:A:C A C . . PR GT 0/0 0/1 0/0 0/0 0/0 0/0 0/0 0/0 ./. 0/1 +1 36 1:36:G:CGCGCG G CGCGCG . . PR GT ./. 0/0 ./. 1/1 1/1 1/1 1/1 1/1 0/1 1/1 +1 37 1:37:T:GTGG T GTGG . . PR GT 1/1 0/1 0/1 1/1 0/1 1/1 1/1 1/1 1/1 1/1 +1 38 1:38:A:C A C . . PR GT 0/1 0/0 0/0 ./. 1/1 1/1 ./. 0/0 0/1 0/0 +1 39 1:39:A:C A C . . PR GT 0/0 0/0 0/1 0/1 0/1 0/0 0/0 0/0 0/1 0/0 +1 40 1:40:T:GTGG T GTGG . . PR GT 1/1 1/1 0/0 0/1 ./. 0/1 1/1 0/1 1/1 1/1 +1 41 1:41:A:C A C . . PR GT 0/1 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 +1 42 1:42:G:CGCGCG G CGCGCG . . PR GT 0/1 ./. 0/1 0/1 0/1 1/1 0/1 0/1 1/1 0/0 +1 43 1:43:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 1/1 ./. 1/1 1/1 1/1 ./. 1/1 +1 44 1:44:ACT:G ACT G . . PR GT 1/1 0/0 1/1 1/1 1/1 1/1 1/1 1/1 1/1 1/1 +1 45 1:45:G:CGCGCG G CGCGCG . . PR GT 0/0 1/1 1/1 1/1 0/1 1/1 1/1 ./. 1/1 0/1 +1 46 1:46:ACT:G ACT G . . PR GT 1/1 1/1 1/1 0/1 0/1 1/1 0/1 1/1 0/1 1/1 +1 47 1:47:G:CGCGCG G CGCGCG . . PR GT 1/1 0/1 ./. 0/1 1/1 1/1 0/1 0/1 0/1 0/0 +1 48 1:48:A:C A C . . PR GT 0/1 0/0 0/1 ./. 0/0 0/0 0/1 0/0 0/0 0/0 +1 49 1:49:A:C A C . . PR GT 0/0 0/1 0/0 0/1 0/0 0/0 0/0 0/0 ./. 0/0 +1 50 1:50:A:C A C . . PR GT 0/1 ./. ./. ./. 0/0 0/0 ./. ./. 0/0 0/1 +1 51 1:51:G:CGCGCG G CGCGCG . . PR GT 0/1 0/0 1/1 0/0 0/0 0/1 0/1 0/1 0/1 0/0 +1 52 1:52:A:C A C . . PR GT 0/0 0/0 0/0 0/1 0/0 0/0 0/1 ./. ./. 0/1 +1 53 1:53:ACT:G ACT G . . PR GT 1/1 ./. 0/1 1/1 1/1 ./. 1/1 0/1 0/1 0/0 +1 54 1:54:A:C A C . . PR GT 0/1 ./. 0/0 0/1 ./. ./. ./. 1/1 0/0 0/0 +1 55 1:55:G:CGCGCG G CGCGCG . . PR GT 0/1 1/1 0/1 1/1 0/1 0/1 ./. 0/1 1/1 0/1 +1 56 1:56:T:GTGG T GTGG . . PR GT 1/1 0/1 ./. 0/1 1/1 0/1 0/1 1/1 0/0 ./. +1 57 1:57:G:CGCGCG G CGCGCG . . PR GT 0/1 0/0 1/1 0/1 0/1 ./. 0/0 1/1 0/0 1/1 +1 58 1:58:A:C A C . . PR GT 1/1 1/1 0/1 1/1 1/1 0/1 0/0 0/1 1/1 0/1 +1 59 1:59:T:GTGG T GTGG . . PR GT 0/1 0/1 1/1 1/1 ./. 0/1 0/1 ./. 0/1 0/1 +1 60 1:60:G:CGCGCG G CGCGCG . . PR GT 1/1 0/1 0/1 0/1 0/0 1/1 0/0 0/1 0/1 1/1 +1 61 1:61:ACT:G ACT G . . PR GT 1/1 1/1 1/1 1/1 1/1 1/1 1/1 0/1 1/1 ./. +1 62 1:62:A:C A C . . PR GT 1/1 0/0 0/0 0/0 0/0 ./. 0/0 0/1 0/1 ./. +1 63 1:63:G:CGCGCG G CGCGCG . . PR GT 0/1 ./. 0/0 1/1 0/1 ./. 0/1 1/1 0/1 1/1 +1 64 1:64:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 1/1 0/1 1/1 1/1 1/1 1/1 1/1 +1 65 1:65:T:GTGG T GTGG . . PR GT 0/1 1/1 1/1 1/1 0/1 0/1 1/1 0/1 1/1 1/1 +1 66 1:66:ACT:G ACT G . . PR GT 1/1 0/0 0/1 0/0 0/0 0/1 0/0 0/1 0/1 0/0 +1 67 1:67:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 1/1 1/1 ./. 1/1 1/1 1/1 0/1 +1 68 1:68:ACT:G ACT G . . PR GT 0/1 1/1 1/1 ./. 1/1 0/1 1/1 0/1 1/1 0/1 +1 69 1:69:G:CGCGCG G CGCGCG . . PR GT 0/1 0/0 0/1 0/0 0/1 0/0 0/0 0/1 0/1 0/0 +1 70 1:70:G:CGCGCG G CGCGCG . . PR GT 0/1 1/1 1/1 0/1 0/0 1/1 1/1 1/1 0/0 0/1 +1 71 1:71:ACT:G ACT G . . PR GT 0/1 1/1 0/1 0/0 0/0 0/1 1/1 1/1 1/1 0/1 +1 72 1:72:G:CGCGCG G CGCGCG . . PR GT 1/1 0/1 1/1 0/1 ./. ./. 1/1 1/1 0/0 0/1 +1 73 1:73:A:C A C . . PR GT 0/0 ./. 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 +1 74 1:74:A:C A C . . PR GT 0/0 0/0 0/1 0/0 0/0 0/0 1/1 1/1 0/0 0/1 +1 75 1:75:T:GTGG T GTGG . . PR GT 1/1 0/1 0/1 1/1 1/1 1/1 0/1 1/1 1/1 1/1 +1 76 1:76:A:C A C . . PR GT 0/0 0/0 ./. 0/0 0/0 0/0 0/0 0/0 0/0 0/0 +1 77 1:77:ACT:G ACT G . . PR GT ./. 1/1 1/1 1/1 0/1 0/1 ./. 1/1 1/1 1/1 +1 78 1:78:ACT:G ACT G . . PR GT 1/1 0/1 1/1 ./. 1/1 1/1 1/1 1/1 1/1 1/1 +1 79 1:79:A:C A C . . PR GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/1 0/0 0/1 +1 80 1:80:A:C A C . . PR GT 0/0 0/0 0/0 ./. 0/0 0/0 0/1 0/0 ./. 0/0 +1 81 1:81:A:C A C . . PR GT 0/0 0/1 ./. 0/1 ./. 0/0 0/0 1/1 0/1 0/1 +1 82 1:82:T:GTGG T GTGG . . PR GT 1/1 1/1 0/1 1/1 1/1 1/1 0/1 1/1 0/1 1/1 +1 83 1:83:A:C A C . . PR GT 0/0 0/1 0/0 0/0 1/1 ./. 0/0 0/0 0/1 0/1 +1 84 1:84:ACT:G ACT G . . PR GT 1/1 ./. 1/1 1/1 1/1 1/1 0/1 1/1 1/1 1/1 +1 85 1:85:A:C A C . . PR GT 0/0 0/1 0/0 0/1 0/0 0/0 0/0 0/0 0/0 0/0 +1 86 1:86:G:CGCGCG G CGCGCG . . PR GT 0/0 0/0 1/1 0/1 0/0 0/1 0/1 0/1 0/1 1/1 +1 87 1:87:ACT:G ACT G . . PR GT 0/1 1/1 1/1 1/1 0/1 1/1 1/1 0/1 1/1 1/1 +1 88 1:88:A:C A C . . PR GT 0/1 0/1 0/1 0/0 ./. ./. 0/0 0/1 0/0 0/0 +1 89 1:89:A:C A C . . PR GT 0/0 0/0 0/1 0/0 0/0 ./. 0/0 0/0 0/0 1/1 +1 90 1:90:T:GTGG T GTGG . . PR GT 0/1 0/1 0/0 0/1 ./. 0/1 0/1 1/1 0/0 0/1 +1 91 1:91:T:GTGG T GTGG . . PR GT 0/1 1/1 1/1 0/1 1/1 1/1 1/1 ./. 1/1 0/1 +1 92 1:92:T:GTGG T GTGG . . PR GT ./. 1/1 1/1 0/1 1/1 0/1 0/1 0/1 1/1 1/1 +1 93 1:93:A:C A C . . PR GT 0/1 0/1 0/1 0/1 0/0 0/0 ./. ./. 0/0 0/0 +1 94 1:94:A:C A C . . PR GT 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0 0/1 0/1 +1 95 1:95:A:C A C . . PR GT 0/1 0/0 0/1 0/1 0/0 0/0 0/0 0/1 0/1 0/0 +1 96 1:96:A:C A C . . PR GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 ./. +1 97 1:97:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 ./. 1/1 1/1 1/1 1/1 ./. 1/1 +1 98 1:98:ACT:G ACT G . . PR GT 1/1 0/1 0/0 0/0 1/1 0/1 0/1 0/1 0/1 1/1 +1 99 1:99:T:GTGG T GTGG . . PR GT 0/1 1/1 1/1 ./. 1/1 1/1 0/1 1/1 0/1 1/1 +1 100 1:100:A:C A C . . PR GT 0/1 1/1 0/0 0/1 0/1 ./. ./. 0/0 0/0 0/0 diff --git a/tests/test_cli.py b/tests/test_cli.py index 2cc133be..841db800 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1026,23 +1026,6 @@ def test_part_size_multiple_vcfs(self): ] -@pytest.mark.parametrize( - "cmd", - [ - main.bio2zarr, - cli.vcf2zarr_main, - cli.plink2zarr, - cli.vcfpartition, - cli.tskit2zarr, - ], -) -def test_version(cmd): - runner = ct.CliRunner() - result = runner.invoke(cmd, ["--version"], catch_exceptions=False) - s = f"version {provenance.__version__}\n" - assert result.stdout.endswith(s) - - class TestTskitEndToEnd: def test_convert(self, tmp_path): ts_path = "tests/data/ts/example.trees" @@ -1060,3 +1043,39 @@ def test_convert(self, tmp_path): assert result.exit_code == 0 # Arbitrary check assert "variant_position" in result.stdout + + +class TestPlinkEndToEnd: + def test_convert(self, tmp_path): + ts_path = "tests/data/plink/example" + zarr_path = tmp_path / "zarr" + runner = ct.CliRunner() + result = runner.invoke( + cli.plink2zarr, + f"convert {ts_path} {zarr_path}", + catch_exceptions=False, + ) + assert result.exit_code == 0 + result = runner.invoke( + cli.vcf2zarr_main, f"inspect {zarr_path}", catch_exceptions=False + ) + assert result.exit_code == 0 + # Arbitrary check + assert "variant_position" in result.stdout + + +@pytest.mark.parametrize( + "cmd", + [ + main.bio2zarr, + cli.vcf2zarr_main, + cli.plink2zarr, + cli.vcfpartition, + cli.tskit2zarr, + ], +) +def test_version(cmd): + runner = ct.CliRunner() + result = runner.invoke(cmd, ["--version"], catch_exceptions=False) + s = f"version {provenance.__version__}\n" + assert result.stdout.endswith(s) diff --git a/tests/test_core.py b/tests/test_core.py index 55abc908..eac1fe62 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -237,7 +237,9 @@ def test_examples(self, chunk_size, size, start, stop): # It works in CI on Linux, but it'll probably break at some point. # It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 5045032), + # NOTE: removed this after additional plink files broke it in #390. + # It's not worth the trouble - see #330 + # ("tests/data", 5053610), ("tests/data/vcf", 5018640), ("tests/data/vcf/sample.vcf.gz", 1089), ], diff --git a/tests/test_plink.py b/tests/test_plink.py index 6377776e..661b62f6 100644 --- a/tests/test_plink.py +++ b/tests/test_plink.py @@ -5,10 +5,26 @@ import numpy.testing as nt import pytest import sgkit as sg -import sgkit.io.plink import xarray.testing as xt +import zarr -from bio2zarr import plink +from bio2zarr import plink, vcf + + +def test_missing_dependency(): + with mock.patch( + "importlib.import_module", + side_effect=ImportError("No module named 'bed_reader'"), + ): + with pytest.raises(ImportError) as exc_info: + plink.convert( + "UNUSED_PATH", + "UNUSED_PATH", + ) + assert ( + "This process requires the optional bed_reader module. " + "Install it with: pip install bio2zarr[plink]" in str(exc_info.value) + ) class TestSmallExample: @@ -17,8 +33,6 @@ def bed_path(self, tmp_path_factory): tmp_path = tmp_path_factory.mktemp("data") path = tmp_path / "example.bed" # 7 sites x 3 samples - # These are counts of allele 1, so we have to flip them - # to get the values we expect dosages = np.array( [ [0, 1, 2], @@ -31,8 +45,17 @@ def bed_path(self, tmp_path_factory): ], dtype=np.int8, ) - bed_reader.to_bed(path, dosages.T) - return path + m = 7 + d = { + "chromosome": ["chr1"] * m, + "sid": [f"id{j}" for j in range(m)], + "bp_position": range(1, m + 1), + "allele_1": ["A"] * m, + "allele_2": ["T"] * m, + "iid": [f"s{j}" for j in range(3)], + } + bed_reader.to_bed(path, dosages.T, properties=d) + return path.with_suffix("") @pytest.fixture(scope="class") def ds(self, tmp_path_factory, bed_path): @@ -47,47 +70,31 @@ def test_genotypes(self, ds): nt.assert_array_equal( call_genotype, [ - [[1, 1], [1, 0], [0, 0]], - [[1, 0], [1, 1], [0, 0]], - [[1, 1], [1, 1], [1, 1]], - [[-1, -1], [1, 1], [1, 1]], - [[0, 0], [1, 1], [1, 1]], - [[-1, -1], [-1, -1], [-1, -1]], + [[0, 0], [0, 1], [1, 1]], + [[0, 1], [0, 0], [1, 1]], [[0, 0], [0, 0], [0, 0]], + [[-1, -1], [0, 0], [0, 0]], + [[1, 1], [0, 0], [0, 0]], + [[-1, -1], [-1, -1], [-1, -1]], + [[1, 1], [1, 1], [1, 1]], ], ) - def test_missing_dependency(self): - with mock.patch( - "importlib.import_module", - side_effect=ImportError("No module named 'bed_reader'"), - ): - with pytest.raises(ImportError) as exc_info: - plink.convert( - "UNUSED_PATH", - "UNUSED_PATH", - ) - assert ( - "This process requires the optional bed_reader module. " - "Install it with: pip install bio2zarr[plink]" in str(exc_info.value) - ) + def test_variant_position(self, ds): + nt.assert_array_equal(ds["variant_position"], np.arange(1, 8)) + + def test_variant_contig(self, ds): + nt.assert_array_equal(ds["variant_contig"], np.zeros(7)) + nt.assert_array_equal(ds["contig_id"], ["chr1"]) + def test_variant_id(self, ds): + nt.assert_array_equal(ds["variant_id"], [f"id{j}" for j in range(7)]) -class TestEqualSgkit: - def test_simulated_example(self, tmp_path): - data_path = "tests/data/plink/" - bed_path = data_path + "plink_sim_10s_100v_10pmiss.bed" - fam_path = data_path + "plink_sim_10s_100v_10pmiss.fam" - bim_path = data_path + "plink_sim_10s_100v_10pmiss.bim" - # print(bed_path) - # print(fam_path) - sg_ds = sgkit.io.plink.read_plink( - bed_path=bed_path, fam_path=fam_path, bim_path=bim_path - ) - out = tmp_path / "example.plink.zarr" - plink.convert(bed_path, out) - ds = sg.load_dataset(out) - nt.assert_array_equal(ds.call_genotype.values, sg_ds.call_genotype.values) + def test_variant_allele(self, ds): + nt.assert_array_equal(ds["variant_allele"], [["T", "A"] for _ in range(7)]) + + def test_sample_id(self, ds): + nt.assert_array_equal(ds["sample_id"], ["s0", "s1", "s2"]) class TestExample: @@ -105,11 +112,15 @@ class TestExample: Base-pair coordinate (1-based; limited to 231-2) Allele 1 (corresponding to clear bits in .bed; usually minor) Allele 2 (corresponding to set bits in .bed; usually major) + + + Note, the VCF interpretation here can be derived by running + plink1.9 --bfile tests/data/plink/example --export vcf """ @pytest.fixture(scope="class") def ds(self, tmp_path_factory): - path = "tests/data/plink/example.bed" + path = "tests/data/plink/example" out = tmp_path_factory.mktemp("data") / "example.plink.zarr" plink.convert(path, out) return sg.load_dataset(out) @@ -121,10 +132,10 @@ def test_variant_position(self, ds): nt.assert_array_equal(ds.variant_position, [10, 20]) def test_variant_allele(self, ds): - nt.assert_array_equal(ds.variant_allele, [["A", "GG"], ["TTT", "C"]]) + nt.assert_array_equal(ds.variant_allele, [["GG", "A"], ["C", "TTT"]]) def test_variant_length(self, ds): - nt.assert_array_equal(ds.variant_length, [2, 3]) + nt.assert_array_equal(ds.variant_length, [2, 1]) def test_contig_id(self, ds): """Test that contig identifiers are correctly extracted and stored.""" @@ -143,47 +154,37 @@ def test_genotypes(self, ds): call_genotype, [ [ - [0, 0], - [0, 0], - [0, 0], - [1, 1], - [1, 1], - [1, 1], [1, 1], [1, 1], [1, 1], - [1, 1], - ], - [ [0, 0], [0, 0], [0, 0], [0, 0], + [0, 0], + [0, 0], + [0, 0], + ], + [ [1, 1], [1, 1], [1, 1], [1, 1], - [1, 1], - [1, 1], + [0, 0], + [0, 0], + [0, 0], + [0, 0], + [0, 0], + [0, 0], ], ], ) - def test_sgkit(self, ds): - path = "tests/data/plink/example" - sg_ds = sgkit.io.plink.read_plink(path=path) - # Can't compare the full dataset yet - nt.assert_array_equal(ds.call_genotype.values, sg_ds.call_genotype.values) - # https://github.com/pystatgen/sgkit/issues/1209 - nt.assert_array_equal(ds.variant_allele, sg_ds.variant_allele.astype(str)) - nt.assert_array_equal(ds.variant_position, sg_ds.variant_position) - nt.assert_array_equal(ds.sample_id, sg_ds.sample_id) - class TestSimulatedExample: @pytest.fixture(scope="class") def ds(self, tmp_path_factory): - path = "tests/data/plink/plink_sim_10s_100v_10pmiss.bed" + path = "tests/data/plink/plink_sim_10s_100v_10pmiss" out = tmp_path_factory.mktemp("data") / "example.plink.zarr" plink.convert(path, out) return sg.load_dataset(out) @@ -207,16 +208,16 @@ def test_genotypes(self, ds): ) expected = np.array( [ - [1, 0], - [1, 0], - [0, 0], - [0, 0], - [-1, -1], + [0, 1], + [0, 1], [1, 1], [1, 1], + [-1, -1], + [0, 0], [0, 0], [1, 1], - [1, 1], + [0, 0], + [0, 0], ] ) gt = ds["call_genotype"].values @@ -259,7 +260,7 @@ def test_contig_arrays(self, ds): def test_chunk_size( self, ds, tmp_path, variants_chunk_size, samples_chunk_size, worker_processes ): - path = "tests/data/plink/plink_sim_10s_100v_10pmiss.bed" + path = "tests/data/plink/plink_sim_10s_100v_10pmiss" out = tmp_path / "example.zarr" plink.convert( path, @@ -276,6 +277,37 @@ def test_chunk_size( # TODO check array chunks +def validate(bed_path, zarr_path): + root = zarr.open(store=zarr_path, mode="r") + call_genotype = root["call_genotype"][:] + + bed = bed_reader.open_bed(bed_path + ".bed", count_A1=True, num_threads=1) + + assert call_genotype.shape[0] == bed.sid_count + assert call_genotype.shape[1] == bed.iid_count + bed_genotypes = bed.read(dtype="int8").T + assert call_genotype.shape[0] == bed_genotypes.shape[0] + assert call_genotype.shape[1] == bed_genotypes.shape[1] + assert call_genotype.shape[2] == 2 + + row_id = 0 + for bed_row, zarr_row in zip(bed_genotypes, call_genotype): + # print("ROW", row_id) + # print(bed_row, zarr_row) + row_id += 1 + for bed_call, zarr_call in zip(bed_row, zarr_row): + if bed_call == -127: + assert list(zarr_call) == [-1, -1] + elif bed_call == 0: + assert list(zarr_call) == [0, 0] + elif bed_call == 1: + assert list(zarr_call) == [0, 1] + elif bed_call == 2: + assert list(zarr_call) == [1, 1] + else: # pragma no cover + raise AssertionError(f"Unexpected bed call {bed_call}") + + @pytest.mark.parametrize( ("variants_chunk_size", "samples_chunk_size"), [ @@ -290,7 +322,7 @@ def test_chunk_size( def test_by_validating( tmp_path, variants_chunk_size, samples_chunk_size, worker_processes ): - path = "tests/data/plink/plink_sim_10s_100v_10pmiss.bed" + path = "tests/data/plink/plink_sim_10s_100v_10pmiss" out = tmp_path / "example.zarr" plink.convert( path, @@ -299,7 +331,7 @@ def test_by_validating( samples_chunk_size=samples_chunk_size, worker_processes=worker_processes, ) - plink.validate(path, out) + validate(path, out) class TestMultipleContigs: @@ -314,7 +346,6 @@ def multi_contig_bed_path(self, tmp_path_factory): # - 1 variant on chromosome 2 # - 2 variants on chromosome X # - 1 variant on chromosome Y - # values are counts of allele 1, will be flipped by the converter dosages = np.array( [ [0, 1, 2, 0], # chr1 @@ -344,7 +375,7 @@ def multi_contig_bed_path(self, tmp_path_factory): # Format: fam_id, ind_id, pat_id, mat_id, sex, phenotype f.write(f"fam{i} ind{i} 0 0 0 -9\n") - return path + return path.with_suffix("") @pytest.fixture(scope="class") def ds(self, tmp_path_factory, multi_contig_bed_path): @@ -356,6 +387,19 @@ def ds(self, tmp_path_factory, multi_contig_bed_path): def test_contig_ids(self, ds): nt.assert_array_equal(ds.contig_id, ["1", "2", "X", "Y"]) + def test_variant_allele(self, ds): + nt.assert_array_equal( + ds.variant_allele, + [ + ["G", "A"], + ["C", "T"], + ["G", "A"], + ["T", "C"], + ["A", "G"], + ["C", "T"], + ], + ) + def test_variant_contig(self, ds): nt.assert_array_equal( ds.variant_contig, [0, 0, 1, 2, 2, 3] @@ -367,12 +411,12 @@ def test_genotypes(self, ds): nt.assert_array_equal( call_genotype, [ - [[1, 1], [1, 0], [0, 0], [1, 1]], # chr1 - [[1, 0], [1, 1], [0, 0], [1, 0]], # chr1 - [[1, 1], [1, 1], [1, 1], [0, 0]], # chr2 - [[0, 0], [1, 1], [1, 0], [1, 1]], # chrX - [[1, 0], [1, 0], [1, 0], [1, 0]], # chrX - [[1, 1], [0, 0], [1, 1], [0, 0]], # chrY + [[0, 0], [0, 1], [1, 1], [0, 0]], # chr1 + [[0, 1], [0, 0], [1, 1], [0, 1]], # chr1 + [[0, 0], [0, 0], [0, 0], [1, 1]], # chr2 + [[1, 1], [0, 0], [0, 1], [0, 0]], # chrX + [[0, 1], [0, 1], [0, 1], [0, 1]], # chrX + [[0, 0], [1, 1], [0, 0], [1, 1]], # chrY ], ) @@ -384,3 +428,26 @@ def test_variant_length(self, ds): ds.variant_length, [1, 1, 1, 1, 1, 1], ) + + +@pytest.mark.parametrize( + "prefix", + [ + "tests/data/plink/example", + "tests/data/plink/example_with_fam", + "tests/data/plink/plink_sim_10s_100v_10pmiss", + ], +) +def test_against_plinks_vcf_output(prefix, tmp_path): + vcf_path = prefix + ".vcf" + plink_zarr = tmp_path / "plink.zarr" + vcf_zarr = tmp_path / "vcf.zarr" + plink.convert(prefix, plink_zarr) + vcf.convert([vcf_path], vcf_zarr) + ds1 = sg.load_dataset(plink_zarr) + ds2 = ( + sg.load_dataset(vcf_zarr) + .drop_dims("filters") + .drop_vars(["variant_quality", "variant_PR", "contig_length"]) + ) + xt.assert_equal(ds1, ds2)