Skip to content

Commit 3664895

Browse files
authored
Merge branch 'master' into 664-freq-functions
2 parents 8c0a432 + ff10e60 commit 3664895

File tree

11 files changed

+207
-137
lines changed

11 files changed

+207
-137
lines changed

.github/workflows/label_issues.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Workflow: adds the "triage" label to an issue whenever it is opened or reopened.
name: Label issues
2+
on:
3+
issues:
4+
types:
5+
- reopened
6+
- opened
7+
jobs:
8+
label_issues:
9+
runs-on: ubuntu-latest
10+
permissions:
11+
issues: write
12+
steps:
13+
# Labels the triggering issue via the GitHub CLI; NUMBER and LABELS are
# supplied through the env entries that follow.
- run: gh issue edit "$NUMBER" --add-label "$LABELS"
14+
env:
15+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
16+
GH_REPO: ${{ github.repository }}
17+
NUMBER: ${{ github.event.issue.number }}
18+
LABELS: triage

malariagen_data/af1.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ def __init__(
123123
taxon_colors=TAXON_COLORS,
124124
virtual_contigs=None,
125125
gene_names=None,
126+
inversion_tag_path=None,
126127
)
127128

128129
def __repr__(self):

malariagen_data/ag3.py

Lines changed: 2 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,11 @@
22

33
import dask
44
import pandas as pd # type: ignore
5-
from pandas import CategoricalDtype
6-
import numpy as np # type: ignore
7-
import allel # type: ignore
85
import plotly.express as px # type: ignore
96

107
import malariagen_data
118
from .anopheles import AnophelesDataResource
129

13-
from numpydoc_decorator import doc
14-
from .util import check_types, _karyotype_tags_n_alt
15-
from .anoph import base_params
16-
from typing import Optional, Literal, Annotated, TypeAlias
1710

1811
# silence dask performance warnings
1912
dask.config.set(**{"array.slicing.split_native_chunks": False}) # type: ignore
@@ -35,6 +28,7 @@
3528
GENE_NAMES = {
3629
"AGAP004707": "Vgsc/para",
3730
}
31+
INVERSION_TAG_PATH = "karyotype_tag_snps.csv"
3832

3933

4034
def _setup_aim_palettes():
@@ -83,12 +77,6 @@ def _setup_aim_palettes():
8377
}
8478

8579

86-
inversion_param: TypeAlias = Annotated[
87-
Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
88-
"Name of inversion to infer karyotype for.",
89-
]
90-
91-
9280
class Ag3(AnophelesDataResource):
9381
"""Provides access to data from Ag3.x releases.
9482
@@ -203,6 +191,7 @@ def __init__(
203191
taxon_colors=TAXON_COLORS,
204192
virtual_contigs=VIRTUAL_CONTIGS,
205193
gene_names=GENE_NAMES,
194+
inversion_tag_path=INVERSION_TAG_PATH,
206195
)
207196

208197
# set up caches
@@ -355,82 +344,3 @@ def _results_cache_add_analysis_params(self, params):
355344
super()._results_cache_add_analysis_params(params)
356345
# override parent class to add AIM analysis
357346
params["aim_analysis"] = self._aim_analysis
358-
359-
@check_types
360-
@doc(
361-
summary="Load tag SNPs for a given inversion in Ag.",
362-
)
363-
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
364-
# needs to be modified depending on where we are hosting
365-
import importlib.resources
366-
from . import resources
367-
368-
with importlib.resources.path(resources, "karyotype_tag_snps.csv") as path:
369-
df_tag_snps = pd.read_csv(path, sep=",")
370-
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()
371-
372-
@check_types
373-
@doc(
374-
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
375-
)
376-
def karyotype(
377-
self,
378-
inversion: inversion_param,
379-
sample_sets: Optional[base_params.sample_sets] = None,
380-
sample_query: Optional[base_params.sample_query] = None,
381-
sample_query_options: Optional[base_params.sample_query_options] = None,
382-
) -> pd.DataFrame:
383-
# load tag snp data
384-
df_tagsnps = self.load_inversion_tags(inversion=inversion)
385-
inversion_pos = df_tagsnps["position"]
386-
inversion_alts = df_tagsnps["alt_allele"]
387-
contig = inversion[0:2]
388-
389-
# get snp calls for inversion region
390-
start, end = np.min(inversion_pos), np.max(inversion_pos)
391-
region = f"{contig}:{start}-{end}"
392-
393-
ds_snps = self.snp_calls(
394-
region=region,
395-
sample_sets=sample_sets,
396-
sample_query=sample_query,
397-
sample_query_options=sample_query_options,
398-
)
399-
400-
with self._spinner("Inferring karyotype from tag SNPs"):
401-
# access variables we need
402-
geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
403-
pos = allel.SortedIndex(ds_snps["variant_position"].values)
404-
samples = ds_snps["sample_id"].values
405-
alts = ds_snps["variant_allele"].values.astype(str)
406-
407-
# subset to position of inversion tags
408-
mask = pos.locate_intersection(inversion_pos)[0]
409-
alts = alts[mask]
410-
geno = geno.compress(mask, axis=0).compute()
411-
412-
# infer karyotype
413-
gn_alt = _karyotype_tags_n_alt(
414-
gt=geno, alts=alts, inversion_alts=inversion_alts
415-
)
416-
is_called = geno.is_called()
417-
418-
# calculate mean genotype for each sample whilst masking missing calls
419-
av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
420-
total_sites = np.sum(is_called, axis=0)
421-
422-
df = pd.DataFrame(
423-
{
424-
"sample_id": samples,
425-
"inversion": inversion,
426-
f"karyotype_{inversion}_mean": av_gts,
427-
# round the genotypes then convert to int
428-
f"karyotype_{inversion}": av_gts.round().astype(int),
429-
"total_tag_snps": total_sites,
430-
},
431-
)
432-
# Allow filling missing values with "<NA>" visible placeholder.
433-
kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
434-
df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)
435-
436-
return df

malariagen_data/anoph/distance.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ def plot_njt(
421421
height: plotly_params.fig_height = 600,
422422
show: plotly_params.show = True,
423423
renderer: plotly_params.renderer = None,
424-
render_mode: plotly_params.render_mode = "auto",
424+
render_mode: plotly_params.render_mode = "svg",
425425
title: plotly_params.title = True,
426426
title_font_size: plotly_params.title_font_size = 14,
427427
line_width: plotly_params.line_width = 0.5,

malariagen_data/anoph/karyotype.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import pandas as pd # type: ignore
2+
from pandas import CategoricalDtype
3+
import numpy as np # type: ignore
4+
import allel # type: ignore
5+
6+
from numpydoc_decorator import doc
7+
from ..util import check_types
8+
from . import base_params
9+
from typing import Optional
10+
11+
from .snp_data import AnophelesSnpData
12+
from .karyotype_params import inversion_param
13+
14+
15+
def _karyotype_tags_n_alt(gt, alts, inversion_alts):
16+
# could be Numba'd for speed but was already quick (not many inversion tag snps)
17+
n_sites = gt.shape[0]
18+
n_samples = gt.shape[1]
19+
20+
# create empty array
21+
inv_n_alt = np.empty((n_sites, n_samples), dtype=np.int8)
22+
23+
# for every site
24+
for i in range(n_sites):
25+
# find the index of the correct tag snp allele
26+
tagsnp_index = np.where(alts[i] == inversion_alts[i])[0]
27+
28+
for j in range(n_samples):
29+
# count alleles which == tag snp allele and store
30+
n_tag_alleles = np.sum(gt[i, j] == tagsnp_index[0])
31+
inv_n_alt[i, j] = n_tag_alleles
32+
33+
return inv_n_alt
34+
35+
36+
class AnophelesKaryotypeAnalysis(AnophelesSnpData):
    """Karyotype inference from inversion tag SNPs.

    Parameters
    ----------
    inversion_tag_path : str, optional
        Name of the tag SNP CSV file inside the package ``resources``. When
        None, `load_inversion_tags` (and therefore `karyotype`) raises
        NotImplementedError.
    **kwargs
        Passed on unchanged to the superclass constructor.
    """

    def __init__(
        self,
        inversion_tag_path: Optional[str] = None,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

        self._inversion_tag_path = inversion_tag_path

    @check_types
    @doc(
        summary="Load tag SNPs for a given inversion.",
    )
    def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
        # needs to be modified depending on where we are hosting
        import importlib.resources
        from .. import resources

        if self._inversion_tag_path is None:
            raise NotImplementedError(
                "No inversion tags are available for this data resource."
            )

        with importlib.resources.path(resources, self._inversion_tag_path) as path:
            df_tag_snps = pd.read_csv(path, sep=",")

        # Select rows with a boolean mask rather than a query string built
        # from the inversion name, so names containing quotes cannot break
        # the expression. An unknown name yields an empty data frame.
        is_inversion = df_tag_snps["inversion"] == inversion
        return df_tag_snps[is_inversion].reset_index()

    @check_types
    @doc(
        summary="Infer karyotype from tag SNPs for a given inversion.",
    )
    def karyotype(
        self,
        inversion: inversion_param,
        sample_sets: Optional[base_params.sample_sets] = None,
        sample_query: Optional[base_params.sample_query] = None,
        sample_query_options: Optional[base_params.sample_query_options] = None,
    ) -> pd.DataFrame:
        # load tag snp data
        df_tagsnps = self.load_inversion_tags(inversion=inversion)
        if df_tagsnps.empty:
            # The inversion name is only typed as a str, so an unknown name
            # must be rejected here before it produces a meaningless region.
            raise ValueError(
                f"No tag SNPs found for inversion {inversion!r} in "
                f"{self._inversion_tag_path!r}."
            )
        inversion_pos = df_tagsnps["position"]
        # Plain array so that tag alleles can be subset and read positionally.
        inversion_alts = df_tagsnps["alt_allele"].to_numpy()
        # The contig is taken from the first two characters of the inversion
        # name (assumes names such as "2La" -> "2L"; confirm for new resources).
        contig = inversion[0:2]

        # get snp calls for inversion region
        start, end = np.min(inversion_pos), np.max(inversion_pos)
        region = f"{contig}:{start}-{end}"

        ds_snps = self.snp_calls(
            region=region,
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
        )

        with self._spinner("Inferring karyotype from tag SNPs"):
            # access variables we need
            geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
            pos = allel.SortedIndex(ds_snps["variant_position"].values)
            samples = ds_snps["sample_id"].values
            alts = ds_snps["variant_allele"].values.astype(str)

            # subset to position of inversion tags; the second mask marks the
            # tag SNPs that were actually found among the SNP call positions
            mask, tag_mask = pos.locate_intersection(inversion_pos)
            alts = alts[mask]
            geno = geno.compress(mask, axis=0).compute()
            # keep tag alleles aligned row-for-row with the located sites,
            # even when some tag positions are absent from the SNP calls
            inversion_alts = inversion_alts[tag_mask]

            # infer karyotype
            gn_alt = _karyotype_tags_n_alt(
                gt=geno, alts=alts, inversion_alts=inversion_alts
            )
            is_called = geno.is_called()

            # calculate mean genotype for each sample whilst masking missing calls
            av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
            total_sites = np.sum(is_called, axis=0)

            df = pd.DataFrame(
                {
                    "sample_id": samples,
                    "inversion": inversion,
                    f"karyotype_{inversion}_mean": av_gts,
                    # round the genotypes then convert to int
                    f"karyotype_{inversion}": av_gts.round().astype(int),
                    "total_tag_snps": total_sites,
                },
            )
            # Allow filling missing values with "<NA>" visible placeholder.
            kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
            df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)

        return df
malariagen_data/anoph/karyotype_params.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
"""Parameter definitions for karyotype analysis functions."""
2+
3+
4+
from typing_extensions import Annotated, TypeAlias
5+
6+
# Annotated alias used to type the `inversion` argument of karyotype functions.
# NOTE(review): typed as a plain ``str``; presumably the valid names are those
# present in each data resource's tag SNP file — confirm against callers.
inversion_param: TypeAlias = Annotated[
7+
str,
8+
"Name of inversion to infer karyotype for.",
9+
]

malariagen_data/anopheles.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
plotly_params,
3030
xpehh_params,
3131
)
32+
from .anoph.karyotype import AnophelesKaryotypeAnalysis
3233
from .anoph.aim_data import AnophelesAimData
3334
from .anoph.base import AnophelesBase
3435
from .anoph.cnv_data import AnophelesCnvData
@@ -94,6 +95,7 @@ class AnophelesDataResource(
9495
AnophelesPca,
9596
PlinkConverter,
9697
AnophelesIgv,
98+
AnophelesKaryotypeAnalysis,
9799
AnophelesAimData,
98100
AnophelesHapData,
99101
AnophelesSnpData,
@@ -138,6 +140,7 @@ def __init__(
138140
taxon_colors: Optional[Mapping[str, str]],
139141
virtual_contigs: Optional[Mapping[str, Sequence[str]]],
140142
gene_names: Optional[Mapping[str, str]],
143+
inversion_tag_path: Optional[str],
141144
):
142145
super().__init__(
143146
url=url,
@@ -171,6 +174,7 @@ def __init__(
171174
taxon_colors=taxon_colors,
172175
virtual_contigs=virtual_contigs,
173176
gene_names=gene_names,
177+
inversion_tag_path=inversion_tag_path,
174178
)
175179

176180
@property

malariagen_data/util.py

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1591,27 +1591,6 @@ def distributed_client():
15911591
return client
15921592

15931593

1594-
def _karyotype_tags_n_alt(gt, alts, inversion_alts):
1595-
# could be Numba'd for speed but was already quick (not many inversion tag snps)
1596-
n_sites = gt.shape[0]
1597-
n_samples = gt.shape[1]
1598-
1599-
# create empty array
1600-
inv_n_alt = np.empty((n_sites, n_samples), dtype=np.int8)
1601-
1602-
# for every site
1603-
for i in range(n_sites):
1604-
# find the index of the correct tag snp allele
1605-
tagsnp_index = np.where(alts[i] == inversion_alts[i])[0]
1606-
1607-
for j in range(n_samples):
1608-
# count alleles which == tag snp allele and store
1609-
n_tag_alleles = np.sum(gt[i, j] == tagsnp_index[0])
1610-
inv_n_alt[i, j] = n_tag_alleles
1611-
1612-
return inv_n_alt
1613-
1614-
16151594
def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
16161595
# Take a copy, as we will modify the dataframe.
16171596
df_samples = df_samples.copy()

notebooks/karyotype.ipynb

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -359,14 +359,6 @@
359359
"source": [
360360
"ag3.plot_pca_coords(pca_df_2rc_col, color=\"karyotype_2Rc_col\", symbol=\"country\", width=600, height=500)"
361361
]
362-
},
363-
{
364-
"cell_type": "code",
365-
"execution_count": null,
366-
"id": "d6fb7237-bb4e-490e-9ae0-33a52d4fa650",
367-
"metadata": {},
368-
"outputs": [],
369-
"source": []
370362
}
371363
],
372364
"metadata": {
@@ -391,7 +383,7 @@
391383
"name": "python",
392384
"nbconvert_exporter": "python",
393385
"pygments_lexer": "ipython3",
394-
"version": "3.10.12"
386+
"version": "3.10.11"
395387
}
396388
},
397389
"nbformat": 4,

0 commit comments

Comments
 (0)