Skip to content

Commit 4b9c438

Browse files
authored
Merge branch 'master' into GH656_avoid_anjl_import
2 parents c1919e4 + 5c95d71 commit 4b9c438

File tree

9 files changed

+1145
-781
lines changed

9 files changed

+1145
-781
lines changed

docs/source/Af1.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ SNP data access
7171
is_accessible
7272
biallelic_snp_calls
7373
biallelic_diplotypes
74+
biallelic_snps_to_plink
7475

7576
Haplotype data access
7677
---------------------

docs/source/Ag3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ SNP data access
7272
is_accessible
7373
biallelic_snp_calls
7474
biallelic_diplotypes
75+
biallelic_snps_to_plink
7576

7677
Haplotype data access
7778
---------------------
malariagen_data/anoph/plink_params.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
"""Parameters for Plink converter functions."""
2+
3+
from typing_extensions import Annotated, TypeAlias
4+
5+
# Annotated alias used to type the `overwrite` argument of the Plink converter:
# a plain ``bool`` carrying its human-readable description as metadata.
# NOTE(review): the description string is presumably picked up by the
# numpydoc_decorator `@doc` machinery to build parameter docs — confirm.
overwrite: TypeAlias = Annotated[
    bool,
    """
    A boolean indicating whether a previously written file with the same name ought
    to be overwritten. Default is False.
    """,
]

# Annotated alias used to type the `output_dir` argument of the Plink converter:
# a plain ``str`` carrying its human-readable description as metadata.
output_dir: TypeAlias = Annotated[
    str,
    """
    A string indicating the desired output file location.
    """,
]

malariagen_data/anoph/to_plink.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
from typing import Optional
2+
3+
import allel # type: ignore
4+
import numpy as np
5+
import os
6+
import bed_reader
7+
8+
from ..util import dask_compress_dataset
9+
from .snp_data import AnophelesSnpData
10+
from . import base_params
11+
from . import plink_params
12+
from . import pca_params
13+
from numpydoc_decorator import doc # type: ignore
14+
15+
16+
class PlinkConverter(
    AnophelesSnpData,
):
    """Cooperative class adding export of biallelic SNP calls to Plink binary files."""

    def __init__(
        self,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

    @doc(
        summary="""
            Write Anopheles biallelic SNP data to the Plink binary file format.
        """,
        extended_summary="""
            This function writes biallelic SNPs to the Plink binary file format. It enables
            subsetting to specific regions (`region`), selecting specific sample sets, or lists of
            samples, randomly downsampling sites, and specifying filters based on missing data and
            minimum minor allele count (see the docs for `biallelic_snp_calls` for more information).
            The `overwrite` parameter, set to true, will enable overwrite of data with the same
            SNP selection parameter values.
        """,
        returns="""
            Base path to files containing binary Plink output files. Append .bed,
            .bim or .fam to obtain paths for the binary genotype table file, variant
            information file and sample information file respectively.
        """,
        notes="""
            This computation may take some time to run, depending on your computing
            environment. Unless the `overwrite` parameter is set to `True`, results will be returned
            from a previous computation, if available.
        """,
    )
    def biallelic_snps_to_plink(
        self,
        output_dir: plink_params.output_dir,
        region: base_params.regions,
        n_snps: base_params.n_snps,
        overwrite: plink_params.overwrite = False,
        thin_offset: base_params.thin_offset = 0,
        sample_sets: Optional[base_params.sample_sets] = None,
        sample_query: Optional[base_params.sample_query] = None,
        sample_indices: Optional[base_params.sample_indices] = None,
        site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
        min_minor_ac: Optional[
            base_params.min_minor_ac
        ] = pca_params.min_minor_ac_default,
        max_missing_an: Optional[
            base_params.max_missing_an
        ] = pca_params.max_missing_an_default,
        random_seed: base_params.random_seed = 42,
        inline_array: base_params.inline_array = base_params.inline_array_default,
        chunks: base_params.chunks = base_params.native_chunks,
    ):
        # Define output files. The base name encodes the region and the SNP
        # selection parameters only.
        # NOTE(review): sample_sets, sample_query, sample_indices, site_mask and
        # random_seed are not part of the name, so with overwrite=False a call
        # differing only in those values returns the earlier files — confirm
        # that this is intended.
        plink_file_path = f"{output_dir}/{region}.{n_snps}.{min_minor_ac}.{max_missing_an}.{thin_offset}"
        bed_file_path = f"{plink_file_path}.bed"

        # Reuse a previous export unless the caller asked for it to be overwritten.
        if os.path.exists(bed_file_path) and not overwrite:
            return plink_file_path

        # Create the output directory up front, so that a missing directory
        # cannot make the final write fail after the expensive computation
        # below has already run.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Get snps
        ds_snps = self.biallelic_snp_calls(
            region=region,
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_indices=sample_indices,
            site_mask=site_mask,
            min_minor_ac=min_minor_ac,
            max_missing_an=max_missing_an,
            n_snps=n_snps,
            thin_offset=thin_offset,
            random_seed=random_seed,
            inline_array=inline_array,
            chunks=chunks,
        )

        # Set up dataset with required vars for plink conversion

        # Compute gt ref counts
        with self._dask_progress("Computing genotype ref counts"):
            gt_asc = ds_snps["call_genotype"].data  # dask array
            # NOTE(review): fill=-127 presumably matches the missing-value
            # sentinel bed_reader expects for integer input — confirm.
            gn_ref = allel.GenotypeDaskArray(gt_asc).to_n_ref(fill=-127)
            gn_ref = gn_ref.compute()

        # Ensure genotypes vary: keep only the variants (axis 0) where at least
        # one sample's value differs from that of the first sample.
        loc_var = np.any(gn_ref != gn_ref[:, 0, np.newaxis], axis=1)

        # Load final data
        ds_snps_final = dask_compress_dataset(ds_snps, loc_var, dim="variants")

        # Init vars for input to bed reader. The matrix is transposed so that
        # the first axis is samples and the second is variants.
        gn_ref_final = gn_ref[loc_var]
        val = gn_ref_final.T
        with self._spinner("Prepare output data"):
            alleles = ds_snps_final["variant_allele"].values
            properties = {
                "iid": ds_snps_final["sample_id"].values,
                "chromosome": ds_snps_final["variant_contig"].values,
                "bp_position": ds_snps_final["variant_position"].values,
                "allele_1": alleles[:, 0],
                "allele_2": alleles[:, 1],
            }

        bed_reader.to_bed(
            filepath=bed_file_path,
            val=val,
            properties=properties,
            count_A1=True,
        )

        return plink_file_path

malariagen_data/anopheles.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
from .anoph.distance import AnophelesDistanceAnalysis
4747
from .anoph.sample_metadata import AnophelesSampleMetadata, locate_cohorts
4848
from .anoph.snp_data import AnophelesSnpData
49+
from .anoph.to_plink import PlinkConverter
4950
from .anoph.g123 import AnophelesG123Analysis
5051
from .anoph.fst import AnophelesFstAnalysis
5152
from .anoph.h12 import AnophelesH12Analysis
@@ -100,6 +101,7 @@ class AnophelesDataResource(
100101
AnophelesSnpFrequencyAnalysis,
101102
AnophelesDistanceAnalysis,
102103
AnophelesPca,
104+
PlinkConverter,
103105
AnophelesIgv,
104106
AnophelesAimData,
105107
AnophelesHapData,

notebooks/plink_convert.ipynb

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import malariagen_data\n",
10+
"import os \n",
11+
"\n",
12+
"ag3 = malariagen_data.Ag3(pre=True)"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": null,
18+
"metadata": {},
19+
"outputs": [],
20+
"source": [
21+
"ag3.biallelic_snps_to_plink(output_dir=os.getcwd(),\n",
22+
" region='2L:100000-2000000',\n",
23+
" n_snps=2000,\n",
24+
" sample_sets='AG1000G-AO',\n",
25+
" )"
26+
]
27+
}
28+
],
29+
"metadata": {
30+
"kernelspec": {
31+
"display_name": "malariagen_plink",
32+
"language": "python",
33+
"name": "python3"
34+
},
35+
"language_info": {
36+
"codemirror_mode": {
37+
"name": "ipython",
38+
"version": 3
39+
},
40+
"file_extension": ".py",
41+
"mimetype": "text/x-python",
42+
"name": "python",
43+
"nbconvert_exporter": "python",
44+
"pygments_lexer": "ipython3",
45+
"version": "3.12.5"
46+
}
47+
},
48+
"nbformat": 4,
49+
"nbformat_minor": 2
50+
}

0 commit comments

Comments
 (0)