|
| 1 | +from typing import Optional |
| 2 | + |
| 3 | +import allel # type: ignore |
| 4 | +import numpy as np |
| 5 | +import os |
| 6 | +import bed_reader |
| 7 | + |
| 8 | +from ..util import dask_compress_dataset |
| 9 | +from .snp_data import AnophelesSnpData |
| 10 | +from . import base_params |
| 11 | +from . import plink_params |
| 12 | +from . import pca_params |
| 13 | +from numpydoc_decorator import doc # type: ignore |
| 14 | + |
| 15 | + |
class PlinkConverter(
    AnophelesSnpData,
):
    """Cooperative mixin that adds export of biallelic SNP calls to the
    Plink binary (.bed/.bim/.fam) file format.

    Designed for cooperative multiple inheritance: all constructor
    parameters are forwarded to the superclass.
    """

    def __init__(
        self,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

    @doc(
        summary="""
        Write Anopheles biallelic SNP data to the Plink binary file format.
        """,
        extended_summary="""
        This function writes biallelic SNPs to the Plink binary file format. It enables
        subsetting to specific regions (`region`), selecting specific sample sets, or lists of
        samples, randomly downsampling sites, and specifying filters based on missing data and
        minimum minor allele count (see the docs for `biallelic_snp_calls` for more information).
        The `overwrite` parameter, set to true, will enable overwrite of data with the same
        SNP selection parameter values.
        """,
        returns="""
        Base path to files containing binary Plink output files. Append .bed,
        .bim or .fam to obtain paths for the binary genotype table file, variant
        information file and sample information file respectively.
        """,
        notes="""
        This computation may take some time to run, depending on your computing
        environment. Unless the `overwrite` parameter is set to `True`, results will be returned
        from a previous computation, if available.
        """,
    )
    def biallelic_snps_to_plink(
        self,
        output_dir: plink_params.output_dir,
        region: base_params.regions,
        n_snps: base_params.n_snps,
        overwrite: plink_params.overwrite = False,
        thin_offset: base_params.thin_offset = 0,
        sample_sets: Optional[base_params.sample_sets] = None,
        sample_query: Optional[base_params.sample_query] = None,
        sample_indices: Optional[base_params.sample_indices] = None,
        site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
        min_minor_ac: Optional[
            base_params.min_minor_ac
        ] = pca_params.min_minor_ac_default,
        max_missing_an: Optional[
            base_params.max_missing_an
        ] = pca_params.max_missing_an_default,
        random_seed: base_params.random_seed = 42,
        inline_array: base_params.inline_array = base_params.inline_array_default,
        chunks: base_params.chunks = base_params.native_chunks,
    ):
        # Define output files.
        #
        # NOTE(review): the cache key encodes only region, n_snps,
        # min_minor_ac, max_missing_an and thin_offset. It does NOT encode
        # sample_sets, sample_query, sample_indices, site_mask or
        # random_seed, so two calls differing only in sample selection will
        # collide on the same path and the first result will be returned
        # for both (unless overwrite=True). Callers should use distinct
        # output_dir values per cohort, or this key should be extended —
        # TODO confirm intended caching semantics.
        plink_file_path = f"{output_dir}/{region}.{n_snps}.{min_minor_ac}.{max_missing_an}.{thin_offset}"

        bed_file_path = f"{plink_file_path}.bed"

        # Check to see if file exists and if overwrite is set to false,
        # return the existing result path without recomputing.
        if os.path.exists(bed_file_path):
            if not overwrite:
                return plink_file_path

        # Access biallelic SNP calls, subset and filtered according to the
        # requested parameters (implemented by a cooperating superclass).
        ds_snps = self.biallelic_snp_calls(
            region=region,
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_indices=sample_indices,
            site_mask=site_mask,
            min_minor_ac=min_minor_ac,
            max_missing_an=max_missing_an,
            n_snps=n_snps,
            thin_offset=thin_offset,
            random_seed=random_seed,
            inline_array=inline_array,
            chunks=chunks,
        )

        # Set up dataset with required vars for plink conversion.

        # Compute genotype ref allele counts; -127 marks missing calls,
        # which is the sentinel value bed_reader uses for missing int8 data.
        with self._dask_progress("Computing genotype ref counts"):
            gt_asc = ds_snps["call_genotype"].data  # dask array
            gn_ref = allel.GenotypeDaskArray(gt_asc).to_n_ref(fill=-127)
            gn_ref = gn_ref.compute()

        # Keep only variants where genotypes vary across samples, i.e., at
        # least one sample differs from the first sample's genotype.
        loc_var = np.any(gn_ref != gn_ref[:, 0, np.newaxis], axis=1)

        # Subset the dataset to the variable sites.
        ds_snps_final = dask_compress_dataset(ds_snps, loc_var, dim="variants")

        # Init vars for input to bed_reader. to_bed expects a
        # (samples, variants) matrix, hence the transpose.
        gn_ref_final = gn_ref[loc_var]
        val = gn_ref_final.T
        with self._spinner("Prepare output data"):
            alleles = ds_snps_final["variant_allele"].values
            properties = {
                "iid": ds_snps_final["sample_id"].values,
                "chromosome": ds_snps_final["variant_contig"].values,
                "bp_position": ds_snps_final["variant_position"].values,
                "allele_1": alleles[:, 0],
                "allele_2": alleles[:, 1],
            }

        # Ensure the output directory exists, otherwise to_bed raises
        # FileNotFoundError on a fresh run.
        os.makedirs(output_dir, exist_ok=True)

        bed_reader.to_bed(
            filepath=bed_file_path,
            val=val,
            properties=properties,
            count_A1=True,
        )

        return plink_file_path
0 commit comments