Skip to content

Commit cdda0a2

Browse files
committed
Moved karyotype to anoph
1 parent 70ad53d commit cdda0a2

File tree

4 files changed

+119
-86
lines changed

4 files changed

+119
-86
lines changed

malariagen_data/ag3.py

Lines changed: 1 addition & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,12 @@
22

33
import dask
44
import pandas as pd # type: ignore
5-
from pandas import CategoricalDtype
6-
import numpy as np # type: ignore
7-
import allel # type: ignore
85
import plotly.express as px # type: ignore
96

107
import malariagen_data
118
from .anopheles import AnophelesDataResource
129

13-
from numpydoc_decorator import doc
14-
from .util import check_types, _karyotype_tags_n_alt
15-
from .anoph import base_params
16-
from typing import Optional, Literal, Annotated, TypeAlias
10+
from typing import Literal, Annotated, TypeAlias
1711

1812
# silence dask performance warnings
1913
dask.config.set(**{"array.slicing.split_native_chunks": False}) # type: ignore
@@ -355,82 +349,3 @@ def _results_cache_add_analysis_params(self, params):
355349
super()._results_cache_add_analysis_params(params)
356350
# override parent class to add AIM analysis
357351
params["aim_analysis"] = self._aim_analysis
358-
359-
@check_types
360-
@doc(
361-
summary="Load tag SNPs for a given inversion in Ag.",
362-
)
363-
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
364-
# needs to be modified depending on where we are hosting
365-
import importlib.resources
366-
from . import resources
367-
368-
with importlib.resources.path(resources, "karyotype_tag_snps.csv") as path:
369-
df_tag_snps = pd.read_csv(path, sep=",")
370-
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()
371-
372-
@check_types
373-
@doc(
374-
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
375-
)
376-
def karyotype(
377-
self,
378-
inversion: inversion_param,
379-
sample_sets: Optional[base_params.sample_sets] = None,
380-
sample_query: Optional[base_params.sample_query] = None,
381-
sample_query_options: Optional[base_params.sample_query_options] = None,
382-
) -> pd.DataFrame:
383-
# load tag snp data
384-
df_tagsnps = self.load_inversion_tags(inversion=inversion)
385-
inversion_pos = df_tagsnps["position"]
386-
inversion_alts = df_tagsnps["alt_allele"]
387-
contig = inversion[0:2]
388-
389-
# get snp calls for inversion region
390-
start, end = np.min(inversion_pos), np.max(inversion_pos)
391-
region = f"{contig}:{start}-{end}"
392-
393-
ds_snps = self.snp_calls(
394-
region=region,
395-
sample_sets=sample_sets,
396-
sample_query=sample_query,
397-
sample_query_options=sample_query_options,
398-
)
399-
400-
with self._spinner("Inferring karyotype from tag SNPs"):
401-
# access variables we need
402-
geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
403-
pos = allel.SortedIndex(ds_snps["variant_position"].values)
404-
samples = ds_snps["sample_id"].values
405-
alts = ds_snps["variant_allele"].values.astype(str)
406-
407-
# subset to position of inversion tags
408-
mask = pos.locate_intersection(inversion_pos)[0]
409-
alts = alts[mask]
410-
geno = geno.compress(mask, axis=0).compute()
411-
412-
# infer karyotype
413-
gn_alt = _karyotype_tags_n_alt(
414-
gt=geno, alts=alts, inversion_alts=inversion_alts
415-
)
416-
is_called = geno.is_called()
417-
418-
# calculate mean genotype for each sample whilst masking missing calls
419-
av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
420-
total_sites = np.sum(is_called, axis=0)
421-
422-
df = pd.DataFrame(
423-
{
424-
"sample_id": samples,
425-
"inversion": inversion,
426-
f"karyotype_{inversion}_mean": av_gts,
427-
# round the genotypes then convert to int
428-
f"karyotype_{inversion}": av_gts.round().astype(int),
429-
"total_tag_snps": total_sites,
430-
},
431-
)
432-
# Allow filling missing values with "<NA>" visible placeholder.
433-
kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
434-
df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)
435-
436-
return df

malariagen_data/anoph/karyotype.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import pandas as pd # type: ignore
2+
from pandas import CategoricalDtype
3+
import numpy as np # type: ignore
4+
import allel # type: ignore
5+
6+
from numpydoc_decorator import doc
7+
from ..util import check_types, _karyotype_tags_n_alt
8+
from . import base_params
9+
from typing import Optional
10+
11+
from .genome_features import AnophelesGenomeFeaturesData
12+
from .genome_sequence import AnophelesGenomeSequenceData
13+
from .sample_metadata import AnophelesSampleMetadata
14+
from .karyotype_params import inversion_param
15+
16+
17+
class AnophelesKaryotypeData(
18+
AnophelesSampleMetadata, AnophelesGenomeFeaturesData, AnophelesGenomeSequenceData
19+
):
20+
def __init__(
21+
self,
22+
**kwargs,
23+
):
24+
# N.B., this class is designed to work cooperatively, and
25+
# so it's important that any remaining parameters are passed
26+
# to the superclass constructor.
27+
super().__init__(**kwargs)
28+
29+
@check_types
30+
@doc(
31+
summary="Load tag SNPs for a given inversion in Ag.",
32+
)
33+
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
34+
# needs to be modified depending on where we are hosting
35+
import importlib.resources
36+
from .. import resources
37+
38+
with importlib.resources.path(resources, "karyotype_tag_snps.csv") as path:
39+
df_tag_snps = pd.read_csv(path, sep=",")
40+
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()
41+
42+
@check_types
43+
@doc(
44+
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
45+
)
46+
def karyotype(
47+
self,
48+
inversion: inversion_param,
49+
sample_sets: Optional[base_params.sample_sets] = None,
50+
sample_query: Optional[base_params.sample_query] = None,
51+
sample_query_options: Optional[base_params.sample_query_options] = None,
52+
) -> pd.DataFrame:
53+
# load tag snp data
54+
df_tagsnps = self.load_inversion_tags(inversion=inversion)
55+
inversion_pos = df_tagsnps["position"]
56+
inversion_alts = df_tagsnps["alt_allele"]
57+
contig = inversion[0:2]
58+
59+
# get snp calls for inversion region
60+
start, end = np.min(inversion_pos), np.max(inversion_pos)
61+
region = f"{contig}:{start}-{end}"
62+
63+
ds_snps = self.snp_calls(
64+
region=region,
65+
sample_sets=sample_sets,
66+
sample_query=sample_query,
67+
sample_query_options=sample_query_options,
68+
)
69+
70+
with self._spinner("Inferring karyotype from tag SNPs"):
71+
# access variables we need
72+
geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
73+
pos = allel.SortedIndex(ds_snps["variant_position"].values)
74+
samples = ds_snps["sample_id"].values
75+
alts = ds_snps["variant_allele"].values.astype(str)
76+
77+
# subset to position of inversion tags
78+
mask = pos.locate_intersection(inversion_pos)[0]
79+
alts = alts[mask]
80+
geno = geno.compress(mask, axis=0).compute()
81+
82+
# infer karyotype
83+
gn_alt = _karyotype_tags_n_alt(
84+
gt=geno, alts=alts, inversion_alts=inversion_alts
85+
)
86+
is_called = geno.is_called()
87+
88+
# calculate mean genotype for each sample whilst masking missing calls
89+
av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
90+
total_sites = np.sum(is_called, axis=0)
91+
92+
df = pd.DataFrame(
93+
{
94+
"sample_id": samples,
95+
"inversion": inversion,
96+
f"karyotype_{inversion}_mean": av_gts,
97+
# round the genotypes then convert to int
98+
f"karyotype_{inversion}": av_gts.round().astype(int),
99+
"total_tag_snps": total_sites,
100+
},
101+
)
102+
# Allow filling missing values with "<NA>" visible placeholder.
103+
kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
104+
df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)
105+
106+
return df
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""Parameter definitions for karyotype analysis functions."""
2+
3+
from typing import Literal
4+
5+
from typing_extensions import Annotated, TypeAlias
6+
7+
inversion_param: TypeAlias = Annotated[
8+
Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
9+
"Name of inversion to infer karyotype for.",
10+
]

malariagen_data/anopheles.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
plotly_params,
3030
xpehh_params,
3131
)
32+
from .anoph.karyotype import AnophelesKaryotypeData
3233
from .anoph.aim_data import AnophelesAimData
3334
from .anoph.base import AnophelesBase
3435
from .anoph.cnv_data import AnophelesCnvData
@@ -94,6 +95,7 @@ class AnophelesDataResource(
9495
AnophelesPca,
9596
PlinkConverter,
9697
AnophelesIgv,
98+
AnophelesKaryotypeData,
9799
AnophelesAimData,
98100
AnophelesHapData,
99101
AnophelesSnpData,

0 commit comments

Comments
 (0)