Merge branch 'master' into plink-converter-2024-03-26

jonbrenas · web-flow · commit 31abfeacbad7 · 2024-11-28T16:12:36.000Z
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
@@ -327,14 +327,18 @@ def _discover_releases(self) -> Tuple[str, ...]:
         )
         # Note: this matches v3, v3. and v3.1, but not v3001.1
         version_pattern = re.compile(f"^v{self._major_version_number}(\\..*)?$")
+        # Sort the releases numerically rather than lexicographically, so that e.g. "3.10" is ordered after "3.9".
+        # The "key" function passed to sorted() splits each release string on "." and converts every part with int(), e.g. "3.1" -> [3, 1];
+        # this assumes every dot-separated part is an integer, otherwise int() raises a ValueError.
         discovered_releases = tuple(
             sorted(
                 [
                     self._path_to_release(d)
                     for d in sub_dirs
                     if version_pattern.match(d)
                     and self._fs.exists(f"{self._base_path}/{d}/manifest.tsv")
-                ]
+                ],
+                key=lambda v: [int(part) for part in v.split(".")],
             )
         )
         return discovered_releases
diff --git a/malariagen_data/anoph/frq_params.py b/malariagen_data/anoph/frq_params.py
@@ -1,6 +1,6 @@
 """Parameter definitions for functions computing and plotting allele frequencies."""
 
-from typing import Literal
+from typing import Literal, List, Optional, Tuple, Union
 
 import xarray as xr
 from typing_extensions import Annotated, TypeAlias
@@ -70,3 +70,13 @@
     bool,
     "Include columns with allele counts and number of non-missing allele calls (nobs).",
 ]
+
+taxa: TypeAlias = Annotated[
+    Optional[Union[str, List[str], Tuple[str, ...]]],
+    "The taxon or taxa to restrict the dataset to.",
+]
+
+areas: TypeAlias = Annotated[
+    Optional[Union[str, List[str], Tuple[str, ...]]],
+    "The area or areas to restrict the dataset to.",
+]
diff --git a/malariagen_data/anoph/snp_frq.py b/malariagen_data/anoph/snp_frq.py
@@ -936,6 +936,8 @@ def plot_frequencies_time_series(
         legend_sizing: plotly_params.legend_sizing = "constant",
         show: plotly_params.show = True,
         renderer: plotly_params.renderer = None,
+        taxa: frq_params.taxa = None,
+        areas: frq_params.areas = None,
         **kwargs,
     ) -> plotly_params.figure:
         # Handle title.
@@ -947,6 +949,18 @@ def plot_frequencies_time_series(
         df_cohorts = ds[cohort_vars].to_dataframe()
         df_cohorts.columns = [c.split("cohort_")[1] for c in df_cohorts.columns]  # type: ignore
 
+        # If specified, restrict the dataframe by taxa.
+        if isinstance(taxa, str):
+            df_cohorts = df_cohorts[df_cohorts["taxon"] == taxa]
+        elif isinstance(taxa, (list, tuple)):
+            df_cohorts = df_cohorts[df_cohorts["taxon"].isin(taxa)]
+
+        # If specified, restrict the dataframe by areas.
+        if isinstance(areas, str):
+            df_cohorts = df_cohorts[df_cohorts["area"] == areas]
+        elif isinstance(areas, (list, tuple)):
+            df_cohorts = df_cohorts[df_cohorts["area"].isin(areas)]
+
         # Extract variant labels.
         variant_labels = ds["variant_label"].values
 
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
@@ -888,14 +888,24 @@ def _gene_cnv(
         chunks,
         inline_array,
     ):
-        debug = self._log.debug
-
-        debug("sanity check")
+        # Sanity check.
         assert isinstance(region, Region)
 
-        debug("access HMM data")
+        # Access genes within the region of interest.
+        df_genome_features = self.genome_features(region=region)
+        sample_query_options = sample_query_options or {}
+        df_genes = df_genome_features.query(
+            f"type == '{self._gff_gene_type}'", **sample_query_options
+        )
+
+        # Refine the region for CNV data to ensure coverage of all requested genes.
+        cnv_region = Region(
+            region.contig, df_genes["start"].min(), df_genes["end"].max()
+        )
+
+        # Access HMM data.
         ds_hmm = self.cnv_hmm(
-            region=region.contig,
+            region=cnv_region,
             sample_sets=sample_sets,
             sample_query=sample_query,
             sample_query_options=sample_query_options,
@@ -909,45 +919,38 @@ def _gene_cnv(
         with self._dask_progress(desc="Load CNV HMM data"):
             pos, end, cn = dask.compute(pos, end, cn)
 
-        debug("access genes")
-        df_genome_features = self.genome_features(region=region)
-        sample_query_options = sample_query_options or {}
-        df_genes = df_genome_features.query(
-            f"type == '{self._gff_gene_type}'", **sample_query_options
-        )
-
-        debug("setup intermediates")
+        # Set up intermediates.
         windows = []
         modes = []
         counts = []
 
-        debug("iterate over genes")
+        # Iterate over genes.
         genes_iterator = self._progress(
             df_genes.itertuples(),
             desc="Compute modal gene copy number",
             total=len(df_genes),
         )
         for gene in genes_iterator:
-            # locate windows overlapping the gene
+            # Locate windows overlapping the gene.
             loc_gene_start = bisect_left(end, gene.start)
             loc_gene_stop = bisect_right(pos, gene.end)
             w = loc_gene_stop - loc_gene_start
             windows.append(w)
 
-            # slice out copy number data for the given gene
+            # Slice out copy number data for the given gene.
             cn_gene = cn[loc_gene_start:loc_gene_stop]
 
-            # compute the modes
+            # Compute the modes.
             m, c = _cn_mode(cn_gene, vmax=12)
             modes.append(m)
             counts.append(c)
 
-        debug("combine results")
+        # Combine results.
         windows = np.array(windows)
         modes = np.vstack(modes)
         counts = np.vstack(counts)
 
-        debug("build dataset")
+        # Build dataset.
         ds_out = xr.Dataset(
             coords={
                 "gene_id": (["genes"], df_genes["ID"].values),
@@ -1182,6 +1185,11 @@ def _gene_cnv_frequencies(
 
                 freq_cols[f"frq_{coh}"] = np.concatenate([amp_freq_coh, del_freq_coh])
 
+        if len(coh_dict) == 0:
+            raise ValueError(
+                "No cohorts available for the given sample selection parameters and minimum cohort size."
+            )
+
         debug("build a dataframe with the frequency columns")
         df_freqs = pd.DataFrame(freq_cols)
 
diff --git a/notebooks/plot_frequencies_heatmap.ipynb b/notebooks/plot_frequencies_heatmap.ipynb
@@ -381,6 +381,44 @@
    "id": "86c5c594",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "interesting_cyp_genes = [\n",
+    "    \"AGAP002862\",  # Cyp6aa1\n",
+    "    \"AGAP013128\",  # Cyp6aa2\n",
+    "    \"AGAP002865\",  # Cyp6p3\n",
+    "    \"AGAP000818\",  # Cyp9k1\n",
+    "    \"AGAP008212\",  # Cyp6m2\n",
+    "    \"AGAP008218\",  # Cyp6z2    \n",
+    "]\n",
+    "\n",
+    "cyp_cnv_freqs_df = ag3.gene_cnv_frequencies(\n",
+    "    region=interesting_cyp_genes,\n",
+    "    cohorts=\"admin1_year\",\n",
+    "    sample_sets=(\"AG1000G-BF-A\", \"AG1000G-BF-B\", \"AG1000G-BF-C\"),\n",
+    "    sample_query=\"taxon == 'coluzzii'\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d7ad130-30c2-4cd3-8906-a7ada3ccc75f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_frequencies_heatmap(\n",
+    "    df=cyp_cnv_freqs_df,\n",
+    "    color_continuous_scale=\"Blues\",\n",
+    "    title=\"Cyp gene CNV frequencies\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83aab417-632e-4fd2-8da4-3ffdd6e233f6",
+   "metadata": {},
+   "outputs": [],
    "source": []
   }
  ],
diff --git a/notebooks/plot_frequencies_space_time.ipynb b/notebooks/plot_frequencies_space_time.ipynb
@@ -1,14 +1,22 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47f669f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import malariagen_data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "f820bc66-2fb2-4ca2-9b54-824e50d61a0a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "import malariagen_data\n",
-    "\n",
     "ag3 = malariagen_data.Ag3(\n",
     "    \"simplecache::gs://vo_agam_release_master_us_central1\",\n",
     "    simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
@@ -23,8 +31,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import malariagen_data\n",
-    "\n",
     "af1 = malariagen_data.Af1(\n",
     "    \"simplecache::gs://vo_afun_release_master_us_central1\",\n",
     "    simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
@@ -69,6 +75,26 @@
     "ag3.plot_frequencies_time_series(ds, height=500, width=1000)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "790c99e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_frequencies_time_series(ds, taxa=\"gambiae\", height=500, width=1000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bfc7298",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_frequencies_time_series(ds, taxa=(\"gambiae\", \"arabiensis\"), height=500, width=1000)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -252,6 +278,26 @@
     "ag3.plot_frequencies_time_series(ds, height=900, width=900)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e16ab3fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_frequencies_time_series(ds, areas=\"BF-09\", height=400, width=900)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26af27a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_frequencies_time_series(ds, areas=(\"BF-09\", \"TZ-25\"), height=400, width=900)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -336,19 +382,11 @@
    "source": [
     "af1.plot_frequencies_interactive_map(ds)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a512b459",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "mgen_data_py3.11",
    "language": "python",
    "name": "python3"
   },
@@ -362,7 +400,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.11.5"
   },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
diff --git a/tests/anoph/test_snp_frq.py b/tests/anoph/test_snp_frq.py
diff --git a/tests/test_af1.py b/tests/test_af1.py
diff --git a/tests/test_ag3.py b/tests/test_ag3.py

Original file line number	Diff line number	Diff line change
`@@ -327,14 +327,18 @@ def _discover_releases(self) -> Tuple[str, ...]:`
`327`	`327`	`)`
`328`	`328`	`# Note: this matches v3, v3. and v3.1, but not v3001.1`
`329`	`329`	`version_pattern = re.compile(f"^v{self._major_version_number}(\\..*)?$")`
	`330`	`+ # Sort the releases numerically rather than lexicographically, so that e.g. "3.10" is ordered after "3.9".`
	`331`	`+ # The "key" function passed to sorted() splits each release string on "." and converts every part with int(), e.g. "3.1" -> [3, 1];`
	`332`	`+ # this assumes every dot-separated part is an integer, otherwise int() raises a ValueError.`
`330`	`333`	`discovered_releases = tuple(`
`331`	`334`	`sorted(`
`332`	`335`	`[`
`333`	`336`	`self._path_to_release(d)`
`334`	`337`	`for d in sub_dirs`
`335`	`338`	`if version_pattern.match(d)`
`336`	`339`	`and self._fs.exists(f"{self._base_path}/{d}/manifest.tsv")`
`337`		`- ]`
	`340`	`+ ],`
	`341`	`+ key=lambda v: [int(part) for part in v.split(".")],`
`338`	`342`	`)`
`339`	`343`	`)`
`340`	`344`	`return discovered_releases`