Skip to content

Commit 260a0d3

Browse files
Merge branch 'GH756-mohamed-laarej-shadow' of https://github.com/malariagen/malariagen-data-python into GH756-mohamed-laarej-shadow
2 parents ee3f144 + 0ddf4ca commit 260a0d3

File tree

7 files changed

+46
-28
lines changed

7 files changed

+46
-28
lines changed

README.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,7 @@ for release notes.
3535

3636
## Developer setup
3737

38-
To get setup for development, see [this
39-
video](https://youtu.be/QniQi-Hoo9A) and the instructions below.
38+
To get set up for development, see [this video if you prefer VS Code](https://youtu.be/zddl3n1DCFM), or [this older video if you prefer PyCharm](https://youtu.be/QniQi-Hoo9A), and the instructions below.
4039

4140
Fork and clone this repo:
4241

@@ -48,27 +47,27 @@ Install Python, e.g.:
4847

4948
```bash
5049
sudo add-apt-repository ppa:deadsnakes/ppa
51-
sudo apt install python3.9 python3.9-venv
50+
sudo apt install python3.10 python3.10-venv
5251
```
5352

5453
Install pipx, e.g.:
5554

5655
```bash
57-
python3.9 -m pip install --user pipx
58-
python3.9 -m pipx ensurepath
56+
python3.10 -m pip install --user pipx
57+
python3.10 -m pipx ensurepath
5958
```
6059

6160
Install [poetry](https://python-poetry.org/docs/#installation), e.g.:
6261

6362
```bash
64-
pipx install poetry==1.8.2 --python=/usr/bin/python3.9
63+
pipx install poetry
6564
```
6665

6766
Create development environment:
6867

6968
```bash
7069
cd malariagen-data-python
71-
poetry use 3.9
70+
poetry env use 3.10
7271
poetry install
7372
```
7473

@@ -81,7 +80,7 @@ poetry shell
8180
Install pre-commit and pre-commit hooks:
8281

8382
```bash
84-
pipx install pre-commit --python=/usr/bin/python3.9
83+
pipx install pre-commit
8584
pre-commit install
8685
```
8786

@@ -97,7 +96,9 @@ Run fast unit tests using simulated data:
9796
poetry run pytest -v tests/anoph
9897
```
9998

100-
To run legacy tests which read data from GCS, you'll need to [install the Google Cloud CLI](https://cloud.google.com/sdk/docs/install). E.g., if on Linux:
99+
To run legacy tests which read data from GCS, you'll need to [request access to MalariaGEN data on GCS](https://malariagen.github.io/vector-data/vobs/vobs-data-access.html).
100+
101+
Once access has been granted, [install the Google Cloud CLI](https://cloud.google.com/sdk/docs/install). E.g., if on Linux:
101102

102103
```bash
103104
./install_gcloud.sh

malariagen_data/anoph/frq_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ def plot_frequencies_heatmap(
210210

211211
# Indexing.
212212
if index is None:
213-
index = list(df.index.names)
213+
index = [str(name) for name in df.index.names]
214214
df = df.reset_index().copy()
215215
if isinstance(index, list):
216216
index_col = (

malariagen_data/anoph/sample_metadata.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
Sequence,
1111
Tuple,
1212
Union,
13+
Hashable,
14+
cast,
1315
)
1416

1517
import ipyleaflet # type: ignore
@@ -49,7 +51,7 @@ def __init__(
4951
# data resources, and so column names and dtype need to be
5052
# passed in as parameters.
5153
self._aim_metadata_columns: Optional[List[str]] = None
52-
self._aim_metadata_dtype: Dict[str, Any] = dict()
54+
self._aim_metadata_dtype: Dict[str, Union[str, type, np.dtype]] = dict()
5355
if isinstance(aim_metadata_dtype, Mapping):
5456
self._aim_metadata_columns = list(aim_metadata_dtype.keys())
5557
self._aim_metadata_dtype.update(aim_metadata_dtype)
@@ -150,7 +152,19 @@ def _parse_general_metadata(
150152
"longitude": "float64",
151153
"sex_call": "object",
152154
}
153-
df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
155+
# Mapping of string dtypes to actual dtypes
156+
dtype_map = {
157+
"object": str,
158+
"int64": np.int64,
159+
"float64": np.float64,
160+
}
161+
162+
# Convert string dtypes to actual dtypes
163+
dtype_fixed: Mapping[Hashable, Union[str, np.dtype, type]] = {
164+
col: dtype_map.get(dtype[col], str) for col in dtype
165+
}
166+
167+
df = pd.read_csv(io.BytesIO(data), dtype=dtype_fixed, na_values="")
154168

155169
# Ensure all column names are lower case.
156170
df.columns = [c.lower() for c in df.columns] # type: ignore
@@ -470,7 +484,12 @@ def _parse_aim_metadata(
470484
if isinstance(data, bytes):
471485
# Parse CSV data.
472486
df = pd.read_csv(
473-
io.BytesIO(data), dtype=self._aim_metadata_dtype, na_values=""
487+
io.BytesIO(data),
488+
dtype=cast(
489+
Mapping[Hashable, Union[str, type, np.dtype]],
490+
self._aim_metadata_dtype,
491+
),
492+
na_values="",
474493
)
475494

476495
# Ensure all column names are lower case.
@@ -901,9 +920,7 @@ def _prep_sample_selection_cache_params(
901920
# integer indices instead.
902921
df_samples = self.sample_metadata(sample_sets=sample_sets)
903922
sample_query_options = sample_query_options or {}
904-
loc_samples = (
905-
df_samples.eval(sample_query, **sample_query_options).values,
906-
)
923+
loc_samples = df_samples.eval(sample_query, **sample_query_options).values
907924
sample_indices = np.nonzero(loc_samples)[0].tolist()
908925

909926
return sample_sets, sample_indices

malariagen_data/anoph/snp_frq.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import allel # type: ignore
55
import numpy as np
6+
import numpy.typing as npt
67
import pandas as pd
78
from numpydoc_decorator import doc # type: ignore
89
import xarray as xr
@@ -517,8 +518,8 @@ def snp_allele_frequencies_advanced(
517518

518519
# Set up main event variables.
519520
n_variants, n_cohorts = len(variant_position), len(df_cohorts)
520-
count = np.zeros((n_variants, n_cohorts), dtype=int)
521-
nobs = np.zeros((n_variants, n_cohorts), dtype=int)
521+
count: npt.NDArray[np.float64] = np.zeros((n_variants, n_cohorts), dtype=int)
522+
nobs: npt.NDArray[np.float64] = np.zeros((n_variants, n_cohorts), dtype=int)
522523

523524
# Build event count and nobs for each cohort.
524525
cohorts_iterator = self._progress(

malariagen_data/plasmodium.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import json
2-
import os
32

43
import dask.array as da
54
import pandas as pd
@@ -60,7 +59,7 @@ def sample_metadata(self):
6059
One row per sample.
6160
"""
6261
if self._cache_sample_metadata is None:
63-
path = os.path.join(self._path, self.CONF["metadata_path"])
62+
path = f"{self._path}/{self.CONF['metadata_path']}"
6463
with self._fs.open(path) as f:
6564
self._cache_sample_metadata = pd.read_csv(f, sep="\t", na_values="")
6665
return self._cache_sample_metadata
@@ -75,7 +74,7 @@ def _open_variant_calls_zarr(self):
7574
7675
"""
7776
if self._cache_variant_calls_zarr is None:
78-
path = os.path.join(self._path, self.CONF["variant_calls_zarr_path"])
77+
path = f"{self._path}/{self.CONF['variant_calls_zarr_path']}"
7978
store = init_zarr_store(fs=self._fs, path=path)
8079
self._cache_variant_calls_zarr = zarr.open_consolidated(store=store)
8180
return self._cache_variant_calls_zarr
@@ -205,7 +204,7 @@ def open_genome(self):
205204
206205
"""
207206
if self._cache_genome is None:
208-
path = os.path.join(self._path, self.CONF["reference_path"])
207+
path = f"{self._path}/{self.CONF['reference_path']}"
209208
store = init_zarr_store(fs=self._fs, path=path)
210209
self._cache_genome = zarr.open_consolidated(store=store)
211210
return self._cache_genome
@@ -317,7 +316,7 @@ def genome_features(self, attributes=("ID", "Parent", "Name")):
317316
try:
318317
df = self._cache_genome_features[attributes]
319318
except KeyError:
320-
path = os.path.join(self._path, self.CONF["annotations_path"])
319+
path = f"{self._path}/{self.CONF['annotations_path']}"
321320
with self._fs.open(path, mode="rb") as f:
322321
df = read_gff3(f, compression="gzip")
323322
if attributes is not None:

notebooks/karyotype.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
"pca_df_2la, pca_evr_2la = ag3.pca(\n",
9494
" region=region_2la,\n",
9595
" sample_sets=sample_sets,\n",
96-
" n_snps=10_000,\n",
96+
" n_snps=50_000,\n",
9797
")\n",
9898
"pca_df_2la = pca_df_2la.merge(kt_df_2la, on=\"sample_id\")\n",
9999
"pca_df_2la.head()"
@@ -180,7 +180,7 @@
180180
"pca_df_2rb, pca_evr_2rb = ag3.pca(\n",
181181
" region=region_2rb,\n",
182182
" sample_sets=sample_sets,\n",
183-
" n_snps=10_000,\n",
183+
" n_snps=50_000,\n",
184184
")\n",
185185
"pca_df_2rb = pca_df_2rb.merge(kt_df_2rb, on=\"sample_id\")\n",
186186
"pca_df_2rb.head()"
@@ -262,7 +262,7 @@
262262
" region=region_2rc,\n",
263263
" sample_sets=sample_sets,\n",
264264
" sample_query=\"taxon == 'gambiae'\",\n",
265-
" n_snps=10_000,\n",
265+
" n_snps=50_000,\n",
266266
")\n",
267267
"pca_df_2rc_gam = pca_df_2rc_gam.merge(kt_df_2rc_gam, on=\"sample_id\")\n",
268268
"pca_df_2rc_gam.head()"
@@ -342,7 +342,7 @@
342342
" region=region_2rc,\n",
343343
" sample_sets=sample_sets,\n",
344344
" sample_query=\"taxon == 'coluzzii'\",\n",
345-
" n_snps=10_000,\n",
345+
" n_snps=50_000,\n",
346346
")\n",
347347
"pca_df_2rc_col = pca_df_2rc_col.merge(kt_df_2rc_col, on=\"sample_id\")\n",
348348
"pca_df_2rc_col.head()"

tests/anoph/test_snp_frq.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def test_snp_effects(fixture, api: AnophelesSnpFrequencyAnalysis):
161161

162162
# Check some values.
163163
assert np.all(df["contig"] == transcript["contig"])
164-
position = df["position"].values
164+
position = df["position"].to_numpy()
165165
assert np.all(position >= transcript["start"])
166166
assert np.all(position <= transcript["end"])
167167
assert np.all(position[1:] >= position[:-1])

0 commit comments

Comments
 (0)