Skip to content

Commit 31abfea

Browse files
authored
Merge branch 'master' into plink-converter-2024-03-26
2 parents de0b36c + 5b02632 commit 31abfea

File tree

9 files changed

+245
-37
lines changed

9 files changed

+245
-37
lines changed

malariagen_data/anoph/base.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,14 +327,18 @@ def _discover_releases(self) -> Tuple[str, ...]:
327327
)
328328
# Note: this matches v3, v3. and v3.1, but not v3001.1
329329
version_pattern = re.compile(f"^v{self._major_version_number}(\\..*)?$")
330+
# To sort the versions numerically, we use a lambda function for the "key" parameter of sorted().
331+
# The lambda function splits each version string into a list of its integer parts, using split('.') and int(), e.g. [3, 1],
332+
# which sorted() then uses to determine the order, as opposed to the default lexicographic order.
330333
discovered_releases = tuple(
331334
sorted(
332335
[
333336
self._path_to_release(d)
334337
for d in sub_dirs
335338
if version_pattern.match(d)
336339
and self._fs.exists(f"{self._base_path}/{d}/manifest.tsv")
337-
]
340+
],
341+
key=lambda v: [int(part) for part in v.split(".")],
338342
)
339343
)
340344
return discovered_releases

malariagen_data/anoph/frq_params.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Parameter definitions for functions computing and plotting allele frequencies."""
22

3-
from typing import Literal
3+
from typing import Literal, List, Optional, Tuple, Union
44

55
import xarray as xr
66
from typing_extensions import Annotated, TypeAlias
@@ -70,3 +70,13 @@
7070
bool,
7171
"Include columns with allele counts and number of non-missing allele calls (nobs).",
7272
]
73+
74+
taxa: TypeAlias = Annotated[
75+
Optional[Union[str, List[str], Tuple[str, ...]]],
76+
"The taxon or taxa to restrict the dataset to.",
77+
]
78+
79+
areas: TypeAlias = Annotated[
80+
Optional[Union[str, List[str], Tuple[str, ...]]],
81+
"The area or areas to restrict the dataset to.",
82+
]

malariagen_data/anoph/snp_frq.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -936,6 +936,8 @@ def plot_frequencies_time_series(
936936
legend_sizing: plotly_params.legend_sizing = "constant",
937937
show: plotly_params.show = True,
938938
renderer: plotly_params.renderer = None,
939+
taxa: frq_params.taxa = None,
940+
areas: frq_params.areas = None,
939941
**kwargs,
940942
) -> plotly_params.figure:
941943
# Handle title.
@@ -947,6 +949,18 @@ def plot_frequencies_time_series(
947949
df_cohorts = ds[cohort_vars].to_dataframe()
948950
df_cohorts.columns = [c.split("cohort_")[1] for c in df_cohorts.columns] # type: ignore
949951

952+
# If specified, restrict the dataframe by taxa.
953+
if isinstance(taxa, str):
954+
df_cohorts = df_cohorts[df_cohorts["taxon"] == taxa]
955+
elif isinstance(taxa, (list, tuple)):
956+
df_cohorts = df_cohorts[df_cohorts["taxon"].isin(taxa)]
957+
958+
# If specified, restrict the dataframe by areas.
959+
if isinstance(areas, str):
960+
df_cohorts = df_cohorts[df_cohorts["area"] == areas]
961+
elif isinstance(areas, (list, tuple)):
962+
df_cohorts = df_cohorts[df_cohorts["area"].isin(areas)]
963+
950964
# Extract variant labels.
951965
variant_labels = ds["variant_label"].values
952966

malariagen_data/anopheles.py

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -888,14 +888,24 @@ def _gene_cnv(
888888
chunks,
889889
inline_array,
890890
):
891-
debug = self._log.debug
892-
893-
debug("sanity check")
891+
# Sanity check.
894892
assert isinstance(region, Region)
895893

896-
debug("access HMM data")
894+
# Access genes within the region of interest.
895+
df_genome_features = self.genome_features(region=region)
896+
sample_query_options = sample_query_options or {}
897+
df_genes = df_genome_features.query(
898+
f"type == '{self._gff_gene_type}'", **sample_query_options
899+
)
900+
901+
# Refine the region for CNV data to ensure coverage of all requested genes.
902+
cnv_region = Region(
903+
region.contig, df_genes["start"].min(), df_genes["end"].max()
904+
)
905+
906+
# Access HMM data.
897907
ds_hmm = self.cnv_hmm(
898-
region=region.contig,
908+
region=cnv_region,
899909
sample_sets=sample_sets,
900910
sample_query=sample_query,
901911
sample_query_options=sample_query_options,
@@ -909,45 +919,38 @@ def _gene_cnv(
909919
with self._dask_progress(desc="Load CNV HMM data"):
910920
pos, end, cn = dask.compute(pos, end, cn)
911921

912-
debug("access genes")
913-
df_genome_features = self.genome_features(region=region)
914-
sample_query_options = sample_query_options or {}
915-
df_genes = df_genome_features.query(
916-
f"type == '{self._gff_gene_type}'", **sample_query_options
917-
)
918-
919-
debug("setup intermediates")
922+
# Set up intermediates.
920923
windows = []
921924
modes = []
922925
counts = []
923926

924-
debug("iterate over genes")
927+
# Iterate over genes.
925928
genes_iterator = self._progress(
926929
df_genes.itertuples(),
927930
desc="Compute modal gene copy number",
928931
total=len(df_genes),
929932
)
930933
for gene in genes_iterator:
931-
# locate windows overlapping the gene
934+
# Locate windows overlapping the gene.
932935
loc_gene_start = bisect_left(end, gene.start)
933936
loc_gene_stop = bisect_right(pos, gene.end)
934937
w = loc_gene_stop - loc_gene_start
935938
windows.append(w)
936939

937-
# slice out copy number data for the given gene
940+
# Slice out copy number data for the given gene.
938941
cn_gene = cn[loc_gene_start:loc_gene_stop]
939942

940-
# compute the modes
943+
# Compute the modes.
941944
m, c = _cn_mode(cn_gene, vmax=12)
942945
modes.append(m)
943946
counts.append(c)
944947

945-
debug("combine results")
948+
# Combine results.
946949
windows = np.array(windows)
947950
modes = np.vstack(modes)
948951
counts = np.vstack(counts)
949952

950-
debug("build dataset")
953+
# Build dataset.
951954
ds_out = xr.Dataset(
952955
coords={
953956
"gene_id": (["genes"], df_genes["ID"].values),
@@ -1182,6 +1185,11 @@ def _gene_cnv_frequencies(
11821185

11831186
freq_cols[f"frq_{coh}"] = np.concatenate([amp_freq_coh, del_freq_coh])
11841187

1188+
if len(coh_dict) == 0:
1189+
raise ValueError(
1190+
"No cohorts available for the given sample selection parameters and minimum cohort size."
1191+
)
1192+
11851193
debug("build a dataframe with the frequency columns")
11861194
df_freqs = pd.DataFrame(freq_cols)
11871195

notebooks/plot_frequencies_heatmap.ipynb

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,44 @@
381381
"id": "86c5c594",
382382
"metadata": {},
383383
"outputs": [],
384+
"source": [
385+
"interesting_cyp_genes = [\n",
386+
" \"AGAP002862\", # Cyp6aa1\n",
387+
" \"AGAP013128\", # Cyp6aa2\n",
388+
" \"AGAP002865\", # Cyp6p3\n",
389+
" \"AGAP000818\", # Cyp9k1\n",
390+
" \"AGAP008212\", # Cyp6m2\n",
391+
" \"AGAP008218\", # Cyp6z2 \n",
392+
"]\n",
393+
"\n",
394+
"cyp_cnv_freqs_df = ag3.gene_cnv_frequencies(\n",
395+
" region=interesting_cyp_genes,\n",
396+
" cohorts=\"admin1_year\",\n",
397+
" sample_sets=(\"AG1000G-BF-A\", \"AG1000G-BF-B\", \"AG1000G-BF-C\"),\n",
398+
" sample_query=\"taxon == 'coluzzii'\",\n",
399+
")"
400+
]
401+
},
402+
{
403+
"cell_type": "code",
404+
"execution_count": null,
405+
"id": "6d7ad130-30c2-4cd3-8906-a7ada3ccc75f",
406+
"metadata": {},
407+
"outputs": [],
408+
"source": [
409+
"ag3.plot_frequencies_heatmap(\n",
410+
" df=cyp_cnv_freqs_df,\n",
411+
" color_continuous_scale=\"Blues\",\n",
412+
" title=\"Cyp gene CNV frequencies\",\n",
413+
")"
414+
]
415+
},
416+
{
417+
"cell_type": "code",
418+
"execution_count": null,
419+
"id": "83aab417-632e-4fd2-8da4-3ffdd6e233f6",
420+
"metadata": {},
421+
"outputs": [],
384422
"source": []
385423
}
386424
],

notebooks/plot_frequencies_space_time.ipynb

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,22 @@
11
{
22
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "47f669f3",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import malariagen_data"
11+
]
12+
},
313
{
414
"cell_type": "code",
515
"execution_count": null,
616
"id": "f820bc66-2fb2-4ca2-9b54-824e50d61a0a",
717
"metadata": {},
818
"outputs": [],
919
"source": [
10-
"import malariagen_data\n",
11-
"\n",
1220
"ag3 = malariagen_data.Ag3(\n",
1321
" \"simplecache::gs://vo_agam_release_master_us_central1\",\n",
1422
" simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
@@ -23,8 +31,6 @@
2331
"metadata": {},
2432
"outputs": [],
2533
"source": [
26-
"import malariagen_data\n",
27-
"\n",
2834
"af1 = malariagen_data.Af1(\n",
2935
" \"simplecache::gs://vo_afun_release_master_us_central1\",\n",
3036
" simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
@@ -69,6 +75,26 @@
6975
"ag3.plot_frequencies_time_series(ds, height=500, width=1000)"
7076
]
7177
},
78+
{
79+
"cell_type": "code",
80+
"execution_count": null,
81+
"id": "790c99e8",
82+
"metadata": {},
83+
"outputs": [],
84+
"source": [
85+
"ag3.plot_frequencies_time_series(ds, taxa=\"gambiae\", height=500, width=1000)"
86+
]
87+
},
88+
{
89+
"cell_type": "code",
90+
"execution_count": null,
91+
"id": "1bfc7298",
92+
"metadata": {},
93+
"outputs": [],
94+
"source": [
95+
"ag3.plot_frequencies_time_series(ds, taxa=(\"gambiae\", \"arabiensis\"), height=500, width=1000)"
96+
]
97+
},
7298
{
7399
"cell_type": "code",
74100
"execution_count": null,
@@ -252,6 +278,26 @@
252278
"ag3.plot_frequencies_time_series(ds, height=900, width=900)"
253279
]
254280
},
281+
{
282+
"cell_type": "code",
283+
"execution_count": null,
284+
"id": "e16ab3fe",
285+
"metadata": {},
286+
"outputs": [],
287+
"source": [
288+
"ag3.plot_frequencies_time_series(ds, areas=\"BF-09\", height=400, width=900)"
289+
]
290+
},
291+
{
292+
"cell_type": "code",
293+
"execution_count": null,
294+
"id": "26af27a1",
295+
"metadata": {},
296+
"outputs": [],
297+
"source": [
298+
"ag3.plot_frequencies_time_series(ds, areas=(\"BF-09\", \"TZ-25\"), height=400, width=900)"
299+
]
300+
},
255301
{
256302
"cell_type": "code",
257303
"execution_count": null,
@@ -336,19 +382,11 @@
336382
"source": [
337383
"af1.plot_frequencies_interactive_map(ds)"
338384
]
339-
},
340-
{
341-
"cell_type": "code",
342-
"execution_count": null,
343-
"id": "a512b459",
344-
"metadata": {},
345-
"outputs": [],
346-
"source": []
347385
}
348386
],
349387
"metadata": {
350388
"kernelspec": {
351-
"display_name": "Python 3 (ipykernel)",
389+
"display_name": "mgen_data_py3.11",
352390
"language": "python",
353391
"name": "python3"
354392
},
@@ -362,7 +400,7 @@
362400
"name": "python",
363401
"nbconvert_exporter": "python",
364402
"pygments_lexer": "ipython3",
365-
"version": "3.10.12"
403+
"version": "3.11.5"
366404
},
367405
"widgets": {
368406
"application/vnd.jupyter.widget-state+json": {

0 commit comments

Comments
 (0)