Skip to content

Commit db100fc

Browse files
authored
Merge branch 'master' into GH407_allow_gene_labels
2 parents 3be2173 + 6cd964b commit db100fc

18 files changed

+1714
-801
lines changed

docs/source/Af1.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ SNP data access
7171
is_accessible
7272
biallelic_snp_calls
7373
biallelic_diplotypes
74+
biallelic_snps_to_plink
7475

7576
Haplotype data access
7677
---------------------

docs/source/Ag3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ SNP data access
7272
is_accessible
7373
biallelic_snp_calls
7474
biallelic_diplotypes
75+
biallelic_snps_to_plink
7576

7677
Haplotype data access
7778
---------------------

malariagen_data/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from .amin1 import Amin1
55
from .anopheles import AnophelesDataResource, Region
66
from .pf7 import Pf7
7+
from .pf8 import Pf8
78
from .pv4 import Pv4
89
from .util import SiteClass
910

malariagen_data/anoph/distance.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import numba # type: ignore
77
import numpy as np
88
from numpydoc_decorator import doc # type: ignore
9-
import anjl.params # type: ignore
109

1110
# Internal imports.
1211
from .snp_data import AnophelesSnpData
@@ -410,10 +409,10 @@ def plot_njt(
410409
metric: distance_params.distance_metric = distance_params.default_distance_metric,
411410
distance_sort: Optional[tree_params.distance_sort] = None,
412411
count_sort: Optional[tree_params.count_sort] = None,
413-
center_x: anjl.params.center_x = 0,
414-
center_y: anjl.params.center_y = 0,
415-
arc_start: anjl.params.arc_start = 0,
416-
arc_stop: anjl.params.arc_stop = 2 * math.pi,
412+
center_x: distance_params.center_x = 0,
413+
center_y: distance_params.center_y = 0,
414+
arc_start: distance_params.arc_start = 0,
415+
arc_stop: distance_params.arc_stop = 2 * math.pi,
417416
width: plotly_params.fig_width = 800,
418417
height: plotly_params.fig_height = 600,
419418
show: plotly_params.show = True,
@@ -426,8 +425,8 @@ def plot_njt(
426425
color_discrete_sequence: plotly_params.color_discrete_sequence = None,
427426
color_discrete_map: plotly_params.color_discrete_map = None,
428427
category_orders: plotly_params.category_order = None,
429-
edge_legend: anjl.params.edge_legend = False,
430-
leaf_legend: anjl.params.leaf_legend = True,
428+
edge_legend: distance_params.edge_legend = False,
429+
leaf_legend: distance_params.leaf_legend = True,
431430
legend_sizing: plotly_params.legend_sizing = "constant",
432431
thin_offset: base_params.thin_offset = 0,
433432
sample_sets: Optional[base_params.sample_sets] = None,
@@ -449,6 +448,10 @@ def plot_njt(
449448
inline_array: base_params.inline_array = base_params.inline_array_default,
450449
chunks: base_params.chunks = base_params.native_chunks,
451450
) -> plotly_params.figure:
451+
# Only import anjl if needed, as it requires a couple of seconds to compile
452+
# functions.
453+
import anjl # type: ignore
454+
452455
# Normalise params.
453456
if count_sort is None and distance_sort is None:
454457
count_sort = True

malariagen_data/anoph/distance_params.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,20 @@
1919
]
2020

2121
default_nj_algorithm: nj_algorithm = "dynamic"
22+
23+
# Layout and legend parameter types for tree plotting. These are used to
# annotate the corresponding `plot_njt` parameters; the string metadata is the
# human-readable parameter description.
#
# NOTE(review): `int | float` is evaluated at import time here (it is a value
# expression, not a lazily-evaluated annotation), which requires Python >= 3.10
# -- confirm against the package's minimum supported Python version.

# Horizontal centre of the plot layout.
center_x: TypeAlias = Annotated[int | float, "X coordinate where plotting is centered."]

# Vertical centre of the plot layout.
center_y: TypeAlias = Annotated[int | float, "Y coordinate where plotting is centered."]

# Start angle of the tree layout (`plot_njt` defaults this to 0).
arc_start: TypeAlias = Annotated[int | float, "Angle where tree layout begins."]

# End angle of the tree layout (`plot_njt` defaults this to 2 * math.pi).
arc_stop: TypeAlias = Annotated[int | float, "Angle where tree layout ends."]

# Toggle for legend entries describing edge colours.
edge_legend: TypeAlias = Annotated[
    bool, "Show legend entries for the different edge (line) colors."
]

# Toggle for legend entries describing leaf node colours and symbols.
leaf_legend: TypeAlias = Annotated[
    bool,
    "Show legend entries for the different leaf node (scatter) colors and symbols.",
]
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
"""Parameters for Plink converter functions."""
2+
3+
from typing_extensions import Annotated, TypeAlias
4+
5+
# Flag controlling whether `biallelic_snps_to_plink` recomputes and rewrites
# output when a `.bed` file with the same derived name already exists.
overwrite: TypeAlias = Annotated[
    bool,
    """
    A boolean indicating whether a previously written file with the same name ought
    to be overwritten. Default is False.
    """,
]

# Location for Plink output. `biallelic_snps_to_plink` uses this value as the
# directory prefix of the generated file names.
output_dir: TypeAlias = Annotated[
    str,
    """
    A string indicating the desired output file location.
    """,
]

malariagen_data/anoph/to_plink.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
from typing import Optional
2+
3+
import allel # type: ignore
4+
import numpy as np
5+
import os
6+
import bed_reader
7+
8+
from ..util import dask_compress_dataset
9+
from .snp_data import AnophelesSnpData
10+
from . import base_params
11+
from . import plink_params
12+
from . import pca_params
13+
from numpydoc_decorator import doc # type: ignore
14+
15+
16+
class PlinkConverter(
    AnophelesSnpData,
):
    """Adds export of biallelic SNP calls to the Plink binary file format."""

    def __init__(
        self,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

    @doc(
        summary="""
            Write Anopheles biallelic SNP data to the Plink binary file format.
        """,
        extended_summary="""
            This function writes biallelic SNPs to the Plink binary file format. It enables
            subsetting to specific regions (`region`), selecting specific sample sets, or lists of
            samples, randomly downsampling sites, and specifying filters based on missing data and
            minimum minor allele count (see the docs for `biallelic_snp_calls` for more information).
            The `overwrite` parameter, set to true, will enable overwrite of data with the same
            SNP selection parameter values.
        """,
        returns="""
            Base path to files containing binary Plink output files. Append .bed,
            .bim or .fam to obtain paths for the binary genotype table file, variant
            information file and sample information file respectively.
        """,
        notes="""
            This computation may take some time to run, depending on your computing
            environment. Unless the `overwrite` parameter is set to `True`, results will be returned
            from a previous computation, if available.
        """,
    )
    def biallelic_snps_to_plink(
        self,
        output_dir: plink_params.output_dir,
        region: base_params.regions,
        n_snps: base_params.n_snps,
        overwrite: plink_params.overwrite = False,
        thin_offset: base_params.thin_offset = 0,
        sample_sets: Optional[base_params.sample_sets] = None,
        sample_query: Optional[base_params.sample_query] = None,
        sample_indices: Optional[base_params.sample_indices] = None,
        site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
        min_minor_ac: Optional[
            base_params.min_minor_ac
        ] = pca_params.min_minor_ac_default,
        max_missing_an: Optional[
            base_params.max_missing_an
        ] = pca_params.max_missing_an_default,
        random_seed: base_params.random_seed = 42,
        inline_array: base_params.inline_array = base_params.inline_array_default,
        chunks: base_params.chunks = base_params.native_chunks,
    ):
        # Define output files. The base path is derived from the SNP selection
        # parameters and is what gets returned to the caller.
        #
        # NOTE(review): the file name does not encode sample_sets, sample_query,
        # sample_indices, site_mask or random_seed, so with overwrite=False a
        # file written for a different sample selection will be returned as-is.
        # Left unchanged here because the returned path is part of the
        # interface -- confirm whether the name should include these.
        plink_file_path = f"{output_dir}/{region}.{n_snps}.{min_minor_ac}.{max_missing_an}.{thin_offset}"

        bed_file_path = f"{plink_file_path}.bed"

        # Reuse a previously written file unless the caller asked to overwrite.
        if os.path.exists(bed_file_path) and not overwrite:
            return plink_file_path

        # Make sure the output directory exists before starting the (possibly
        # slow) computation, so a missing or unwritable directory is reported
        # up front rather than after all the work has been done.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Get snps
        ds_snps = self.biallelic_snp_calls(
            region=region,
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_indices=sample_indices,
            site_mask=site_mask,
            min_minor_ac=min_minor_ac,
            max_missing_an=max_missing_an,
            n_snps=n_snps,
            thin_offset=thin_offset,
            random_seed=random_seed,
            inline_array=inline_array,
            chunks=chunks,
        )

        # Set up dataset with required vars for plink conversion

        # Compute gt ref counts. Missing calls are filled with -127, which is
        # the int8 missing-value sentinel documented for bed_reader.
        with self._dask_progress("Computing genotype ref counts"):
            gt_asc = ds_snps["call_genotype"].data  # dask array
            gn_ref = allel.GenotypeDaskArray(gt_asc).to_n_ref(fill=-127)
            gn_ref = gn_ref.compute()

        # Ensure genotypes vary: keep only variants where at least one sample's
        # value differs from the first sample's value.
        loc_var = np.any(gn_ref != gn_ref[:, 0, np.newaxis], axis=1)

        # Load final data
        ds_snps_final = dask_compress_dataset(ds_snps, loc_var, dim="variants")

        # Init vars for input to bed reader. The count array is transposed
        # before being handed to bed_reader.
        gn_ref_final = gn_ref[loc_var]
        val = gn_ref_final.T
        with self._spinner("Prepare output data"):
            alleles = ds_snps_final["variant_allele"].values
            properties = {
                "iid": ds_snps_final["sample_id"].values,
                "chromosome": ds_snps_final["variant_contig"].values,
                "bp_position": ds_snps_final["variant_position"].values,
                "allele_1": alleles[:, 0],
                "allele_2": alleles[:, 1],
            }

        bed_reader.to_bed(
            filepath=bed_file_path,
            val=val,
            properties=properties,
            count_A1=True,
        )

        return plink_file_path

malariagen_data/anopheles.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
from .anoph.distance import AnophelesDistanceAnalysis
4747
from .anoph.sample_metadata import AnophelesSampleMetadata, locate_cohorts
4848
from .anoph.snp_data import AnophelesSnpData
49+
from .anoph.to_plink import PlinkConverter
4950
from .anoph.g123 import AnophelesG123Analysis
5051
from .anoph.fst import AnophelesFstAnalysis
5152
from .anoph.h12 import AnophelesH12Analysis
@@ -100,6 +101,7 @@ class AnophelesDataResource(
100101
AnophelesSnpFrequencyAnalysis,
101102
AnophelesDistanceAnalysis,
102103
AnophelesPca,
104+
PlinkConverter,
103105
AnophelesIgv,
104106
AnophelesAimData,
105107
AnophelesHapData,

malariagen_data/pf8.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import os
2+
3+
from .plasmodium import PlasmodiumDataResource
4+
5+
6+
class Pf8(PlasmodiumDataResource):
    """Provides access to data from the Pf8 release.

    Parameters
    ----------
    url : str, optional
        Base path to data. Default uses Google Cloud Storage "gs://pf8-release/",
        or specify a local path on your file system if data have been downloaded.
    data_config : str, optional
        Path to config for structure of Pf8 data resource. Defaults to config included
        with the malariagen_data package.
    **kwargs
        Passed through to fsspec when setting up file system access.

    Examples
    --------
    Access data from Google Cloud Storage (default):

        >>> import malariagen_data
        >>> pf8 = malariagen_data.Pf8()

    Access data downloaded to a local file system:

        >>> pf8 = malariagen_data.Pf8("/local/path/to/pf8-release/")

    """

    def __init__(
        self,
        url=None,
        data_config=None,
        **kwargs,
    ):
        # Fall back to the config file that sits next to this module when the
        # caller does not supply one.
        if not data_config:
            working_dir = os.path.dirname(os.path.abspath(__file__))
            data_config = os.path.join(working_dir, "pf8_config.json")

        # Forward the remaining keyword arguments so that the documented
        # pass-through of file system options actually takes effect; they were
        # previously accepted and then silently discarded.
        # NOTE(review): assumes PlasmodiumDataResource.__init__ accepts
        # **kwargs -- confirm against the base class.
        super().__init__(data_config=data_config, url=url, **kwargs)

0 commit comments

Comments
 (0)