|
2 | 2 |
|
3 | 3 | import dask
|
4 | 4 | import pandas as pd # type: ignore
|
5 |
| -from pandas import CategoricalDtype |
6 |
| -import numpy as np # type: ignore |
7 |
| -import allel # type: ignore |
8 | 5 | import plotly.express as px # type: ignore
|
9 | 6 |
|
10 | 7 | import malariagen_data
|
11 | 8 | from .anopheles import AnophelesDataResource
|
12 | 9 |
|
13 |
| -from numpydoc_decorator import doc |
14 |
| -from .util import check_types, _karyotype_tags_n_alt |
15 |
| -from .anoph import base_params |
16 |
| -from typing import Optional, Literal, Annotated, TypeAlias |
17 | 10 |
|
18 | 11 | # silence dask performance warnings
|
19 | 12 | dask.config.set(**{"array.slicing.split_native_chunks": False}) # type: ignore
|
|
35 | 28 | GENE_NAMES = {
|
36 | 29 | "AGAP004707": "Vgsc/para",
|
37 | 30 | }
|
| 31 | +INVERSION_TAG_PATH = "karyotype_tag_snps.csv" |
38 | 32 |
|
39 | 33 |
|
40 | 34 | def _setup_aim_palettes():
|
@@ -83,12 +77,6 @@ def _setup_aim_palettes():
|
83 | 77 | }
|
84 | 78 |
|
85 | 79 |
|
86 |
| -inversion_param: TypeAlias = Annotated[ |
87 |
| - Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"], |
88 |
| - "Name of inversion to infer karyotype for.", |
89 |
| -] |
90 |
| - |
91 |
| - |
92 | 80 | class Ag3(AnophelesDataResource):
|
93 | 81 | """Provides access to data from Ag3.x releases.
|
94 | 82 |
|
@@ -203,6 +191,7 @@ def __init__(
|
203 | 191 | taxon_colors=TAXON_COLORS,
|
204 | 192 | virtual_contigs=VIRTUAL_CONTIGS,
|
205 | 193 | gene_names=GENE_NAMES,
|
| 194 | + inversion_tag_path=INVERSION_TAG_PATH, |
206 | 195 | )
|
207 | 196 |
|
208 | 197 | # set up caches
|
@@ -355,82 +344,3 @@ def _results_cache_add_analysis_params(self, params):
|
355 | 344 | super()._results_cache_add_analysis_params(params)
|
356 | 345 | # override parent class to add AIM analysis
|
357 | 346 | params["aim_analysis"] = self._aim_analysis
|
358 |
| - |
359 |
| - @check_types |
360 |
| - @doc( |
361 |
| - summary="Load tag SNPs for a given inversion in Ag.", |
362 |
| - ) |
363 |
| - def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame: |
364 |
| - # needs to be modified depending on where we are hosting |
365 |
| - import importlib.resources |
366 |
| - from . import resources |
367 |
| - |
368 |
| - with importlib.resources.path(resources, "karyotype_tag_snps.csv") as path: |
369 |
| - df_tag_snps = pd.read_csv(path, sep=",") |
370 |
| - return df_tag_snps.query(f"inversion == '{inversion}'").reset_index() |
371 |
| - |
372 |
| - @check_types |
373 |
| - @doc( |
374 |
| - summary="Infer karyotype from tag SNPs for a given inversion in Ag.", |
375 |
| - ) |
376 |
| - def karyotype( |
377 |
| - self, |
378 |
| - inversion: inversion_param, |
379 |
| - sample_sets: Optional[base_params.sample_sets] = None, |
380 |
| - sample_query: Optional[base_params.sample_query] = None, |
381 |
| - sample_query_options: Optional[base_params.sample_query_options] = None, |
382 |
| - ) -> pd.DataFrame: |
383 |
| - # load tag snp data |
384 |
| - df_tagsnps = self.load_inversion_tags(inversion=inversion) |
385 |
| - inversion_pos = df_tagsnps["position"] |
386 |
| - inversion_alts = df_tagsnps["alt_allele"] |
387 |
| - contig = inversion[0:2] |
388 |
| - |
389 |
| - # get snp calls for inversion region |
390 |
| - start, end = np.min(inversion_pos), np.max(inversion_pos) |
391 |
| - region = f"{contig}:{start}-{end}" |
392 |
| - |
393 |
| - ds_snps = self.snp_calls( |
394 |
| - region=region, |
395 |
| - sample_sets=sample_sets, |
396 |
| - sample_query=sample_query, |
397 |
| - sample_query_options=sample_query_options, |
398 |
| - ) |
399 |
| - |
400 |
| - with self._spinner("Inferring karyotype from tag SNPs"): |
401 |
| - # access variables we need |
402 |
| - geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data) |
403 |
| - pos = allel.SortedIndex(ds_snps["variant_position"].values) |
404 |
| - samples = ds_snps["sample_id"].values |
405 |
| - alts = ds_snps["variant_allele"].values.astype(str) |
406 |
| - |
407 |
| - # subset to position of inversion tags |
408 |
| - mask = pos.locate_intersection(inversion_pos)[0] |
409 |
| - alts = alts[mask] |
410 |
| - geno = geno.compress(mask, axis=0).compute() |
411 |
| - |
412 |
| - # infer karyotype |
413 |
| - gn_alt = _karyotype_tags_n_alt( |
414 |
| - gt=geno, alts=alts, inversion_alts=inversion_alts |
415 |
| - ) |
416 |
| - is_called = geno.is_called() |
417 |
| - |
418 |
| - # calculate mean genotype for each sample whilst masking missing calls |
419 |
| - av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0) |
420 |
| - total_sites = np.sum(is_called, axis=0) |
421 |
| - |
422 |
| - df = pd.DataFrame( |
423 |
| - { |
424 |
| - "sample_id": samples, |
425 |
| - "inversion": inversion, |
426 |
| - f"karyotype_{inversion}_mean": av_gts, |
427 |
| - # round the genotypes then convert to int |
428 |
| - f"karyotype_{inversion}": av_gts.round().astype(int), |
429 |
| - "total_tag_snps": total_sites, |
430 |
| - }, |
431 |
| - ) |
432 |
| - # Allow filling missing values with "<NA>" visible placeholder. |
433 |
| - kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True) |
434 |
| - df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype) |
435 |
| - |
436 |
| - return df |
0 commit comments