Merge branch 'GH756-mohamed-laarej-shadow' of https://github.com/malariagen/malariagen-data-python into GH756-mohamed-laarej-shadow

mohamed-laarej · mohamed-laarej · commit 260a0d3096d5 · 2025-08-04T17:30:04.000+01:00
diff --git a/README.md b/README.md
@@ -35,8 +35,7 @@ for release notes.
 
 ## Developer setup
 
-To get setup for development, see [this
-video](https://youtu.be/QniQi-Hoo9A) and the instructions below.
+To get set up for development, see [this video if you prefer VS Code](https://youtu.be/zddl3n1DCFM), or [this older video if you prefer PyCharm](https://youtu.be/QniQi-Hoo9A), and the instructions below.
 
 Fork and clone this repo:
 
@@ -48,27 +47,27 @@ Install Python, e.g.:
 
 ```bash
 sudo add-apt-repository ppa:deadsnakes/ppa
-sudo apt install python3.9 python3.9-venv
+sudo apt install python3.10 python3.10-venv
 ```
 
 Install pipx, e.g.:
 
 ```bash
-python3.9 -m pip install --user pipx
-python3.9 -m pipx ensurepath
+python3.10 -m pip install --user pipx
+python3.10 -m pipx ensurepath
 ```
 
 Install [poetry](https://python-poetry.org/docs/#installation), e.g.:
 
 ```bash
-pipx install poetry==1.8.2 --python=/usr/bin/python3.9
+pipx install poetry
 ```
 
 Create development environment:
 
 ```bash
 cd malariagen-data-python
-poetry use 3.9
+poetry env use 3.10
 poetry install
 ```
 
@@ -81,7 +80,7 @@ poetry shell
 Install pre-commit and pre-commit hooks:
 
 ```bash
-pipx install pre-commit --python=/usr/bin/python3.9
+pipx install pre-commit
 pre-commit install
 ```
 
@@ -97,7 +96,9 @@ Run fast unit tests using simulated data:
 poetry run pytest -v tests/anoph
 ```
 
-To run legacy tests which read data from GCS, you'll need to [install the Google Cloud CLI](https://cloud.google.com/sdk/docs/install). E.g., if on Linux:
+To run legacy tests which read data from GCS, you'll need to [request access to MalariaGEN data on GCS](https://malariagen.github.io/vector-data/vobs/vobs-data-access.html).
+
+Once access has been granted, [install the Google Cloud CLI](https://cloud.google.com/sdk/docs/install). E.g., if on Linux:
 
 ```bash
 ./install_gcloud.sh
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
@@ -210,7 +210,7 @@ def plot_frequencies_heatmap(
 
         # Indexing.
         if index is None:
-            index = list(df.index.names)
+            index = [str(name) for name in df.index.names]
         df = df.reset_index().copy()
         if isinstance(index, list):
             index_col = (
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
@@ -10,6 +10,8 @@
     Sequence,
     Tuple,
     Union,
+    Hashable,
+    cast,
 )
 
 import ipyleaflet  # type: ignore
@@ -49,7 +51,7 @@ def __init__(
         # data resources, and so column names and dtype need to be
         # passed in as parameters.
         self._aim_metadata_columns: Optional[List[str]] = None
-        self._aim_metadata_dtype: Dict[str, Any] = dict()
+        self._aim_metadata_dtype: Dict[str, Union[str, type, np.dtype]] = dict()
         if isinstance(aim_metadata_dtype, Mapping):
             self._aim_metadata_columns = list(aim_metadata_dtype.keys())
             self._aim_metadata_dtype.update(aim_metadata_dtype)
@@ -150,7 +152,19 @@ def _parse_general_metadata(
                 "longitude": "float64",
                 "sex_call": "object",
             }
-            df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+            # Mapping of string dtypes to actual dtypes
+            dtype_map = {
+                "object": str,
+                "int64": np.int64,
+                "float64": np.float64,
+            }
+
+            # Convert string dtypes to actual dtypes
+            dtype_fixed: Mapping[Hashable, Union[str, np.dtype, type]] = {
+                col: dtype_map.get(dtype[col], str) for col in dtype
+            }
+
+            df = pd.read_csv(io.BytesIO(data), dtype=dtype_fixed, na_values="")
 
             # Ensure all column names are lower case.
             df.columns = [c.lower() for c in df.columns]  # type: ignore
@@ -470,7 +484,12 @@ def _parse_aim_metadata(
         if isinstance(data, bytes):
             # Parse CSV data.
             df = pd.read_csv(
-                io.BytesIO(data), dtype=self._aim_metadata_dtype, na_values=""
+                io.BytesIO(data),
+                dtype=cast(
+                    Mapping[Hashable, Union[str, type, np.dtype]],
+                    self._aim_metadata_dtype,
+                ),
+                na_values="",
             )
 
             # Ensure all column names are lower case.
@@ -901,9 +920,7 @@ def _prep_sample_selection_cache_params(
             # integer indices instead.
             df_samples = self.sample_metadata(sample_sets=sample_sets)
             sample_query_options = sample_query_options or {}
-            loc_samples = (
-                df_samples.eval(sample_query, **sample_query_options).values,
-            )
+            loc_samples = df_samples.eval(sample_query, **sample_query_options).values
             sample_indices = np.nonzero(loc_samples)[0].tolist()
 
         return sample_sets, sample_indices
diff --git a/malariagen_data/anoph/snp_frq.py b/malariagen_data/anoph/snp_frq.py
@@ -3,6 +3,7 @@
 
 import allel  # type: ignore
 import numpy as np
+import numpy.typing as npt
 import pandas as pd
 from numpydoc_decorator import doc  # type: ignore
 import xarray as xr
@@ -517,8 +518,8 @@ def snp_allele_frequencies_advanced(
 
         # Set up main event variables.
         n_variants, n_cohorts = len(variant_position), len(df_cohorts)
-        count = np.zeros((n_variants, n_cohorts), dtype=int)
-        nobs = np.zeros((n_variants, n_cohorts), dtype=int)
+        count: npt.NDArray[np.float64] = np.zeros((n_variants, n_cohorts), dtype=int)
+        nobs: npt.NDArray[np.float64] = np.zeros((n_variants, n_cohorts), dtype=int)
 
         # Build event count and nobs for each cohort.
         cohorts_iterator = self._progress(
diff --git a/malariagen_data/plasmodium.py b/malariagen_data/plasmodium.py
@@ -1,5 +1,4 @@
 import json
-import os
 
 import dask.array as da
 import pandas as pd
@@ -60,7 +59,7 @@ def sample_metadata(self):
             One row per sample.
         """
         if self._cache_sample_metadata is None:
-            path = os.path.join(self._path, self.CONF["metadata_path"])
+            path = f"{self._path}/{self.CONF['metadata_path']}"
             with self._fs.open(path) as f:
                 self._cache_sample_metadata = pd.read_csv(f, sep="\t", na_values="")
         return self._cache_sample_metadata
@@ -75,7 +74,7 @@ def _open_variant_calls_zarr(self):
 
         """
         if self._cache_variant_calls_zarr is None:
-            path = os.path.join(self._path, self.CONF["variant_calls_zarr_path"])
+            path = f"{self._path}/{self.CONF['variant_calls_zarr_path']}"
             store = init_zarr_store(fs=self._fs, path=path)
             self._cache_variant_calls_zarr = zarr.open_consolidated(store=store)
         return self._cache_variant_calls_zarr
@@ -205,7 +204,7 @@ def open_genome(self):
 
         """
         if self._cache_genome is None:
-            path = os.path.join(self._path, self.CONF["reference_path"])
+            path = f"{self._path}/{self.CONF['reference_path']}"
             store = init_zarr_store(fs=self._fs, path=path)
             self._cache_genome = zarr.open_consolidated(store=store)
         return self._cache_genome
@@ -317,7 +316,7 @@ def genome_features(self, attributes=("ID", "Parent", "Name")):
         try:
             df = self._cache_genome_features[attributes]
         except KeyError:
-            path = os.path.join(self._path, self.CONF["annotations_path"])
+            path = f"{self._path}/{self.CONF['annotations_path']}"
             with self._fs.open(path, mode="rb") as f:
                 df = read_gff3(f, compression="gzip")
             if attributes is not None:
diff --git a/notebooks/karyotype.ipynb b/notebooks/karyotype.ipynb
@@ -93,7 +93,7 @@
     "pca_df_2la, pca_evr_2la = ag3.pca(\n",
     "    region=region_2la,\n",
     "    sample_sets=sample_sets,\n",
-    "    n_snps=10_000,\n",
+    "    n_snps=50_000,\n",
     ")\n",
     "pca_df_2la = pca_df_2la.merge(kt_df_2la, on=\"sample_id\")\n",
     "pca_df_2la.head()"
@@ -180,7 +180,7 @@
     "pca_df_2rb, pca_evr_2rb = ag3.pca(\n",
     "    region=region_2rb,\n",
     "    sample_sets=sample_sets,\n",
-    "    n_snps=10_000,\n",
+    "    n_snps=50_000,\n",
     ")\n",
     "pca_df_2rb = pca_df_2rb.merge(kt_df_2rb, on=\"sample_id\")\n",
     "pca_df_2rb.head()"
@@ -262,7 +262,7 @@
     "    region=region_2rc,\n",
     "    sample_sets=sample_sets,\n",
     "    sample_query=\"taxon == 'gambiae'\",\n",
-    "    n_snps=10_000,\n",
+    "    n_snps=50_000,\n",
     ")\n",
     "pca_df_2rc_gam = pca_df_2rc_gam.merge(kt_df_2rc_gam, on=\"sample_id\")\n",
     "pca_df_2rc_gam.head()"
@@ -342,7 +342,7 @@
     "    region=region_2rc,\n",
     "    sample_sets=sample_sets,\n",
     "    sample_query=\"taxon == 'coluzzii'\",\n",
-    "    n_snps=10_000,\n",
+    "    n_snps=50_000,\n",
     ")\n",
     "pca_df_2rc_col = pca_df_2rc_col.merge(kt_df_2rc_col, on=\"sample_id\")\n",
     "pca_df_2rc_col.head()"
diff --git a/tests/anoph/test_snp_frq.py b/tests/anoph/test_snp_frq.py
@@ -161,7 +161,7 @@ def test_snp_effects(fixture, api: AnophelesSnpFrequencyAnalysis):
 
     # Check some values.
     assert np.all(df["contig"] == transcript["contig"])
-    position = df["position"].values
+    position = df["position"].to_numpy()
     assert np.all(position >= transcript["start"])
     assert np.all(position <= transcript["end"])
     assert np.all(position[1:] >= position[:-1])