Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
fc6c2ba
WIP: dev support for unrestricted_use_only, surveillance_use_only
leehart Feb 7, 2025
dfdd4e2
Add test sample sets for Af1 with unrestricted_use_only. Add relevant…
leehart Feb 18, 2025
1b6cbb9
Update comment re skipping test due to lack of relevant fixtures
leehart Feb 18, 2025
02921c9
Add surveillance flags to sample_metadata(). Add tests.
leehart Feb 20, 2025
d4af40a
Merge branch 'master' into GH716_add_constructor_params
leehart Feb 20, 2025
d4e7e70
Merge branch 'master' into GH716_add_constructor_params
leehart Mar 18, 2025
19902b0
WIP: add _prep_sample_query_param() stub where _prep_sample_set_param()
leehart Mar 20, 2025
d7b8383
Add logic to _prep_sample_query_param() to honour self._surveillance_…
leehart Mar 21, 2025
de0daf8
Merge branch 'master' into GH716_add_constructor_params
leehart Mar 21, 2025
435e8a7
Allow _prep_sample_query_param() to return None
leehart Mar 21, 2025
bde3d4e
Return consistent data type from _prep_sample_query_param()
leehart Mar 21, 2025
bfed3f4
Merge branch 'master' into GH716_add_constructor_params
leehart Apr 8, 2025
78d26d1
Merge branch 'master' into GH716_add_constructor_params
leehart Apr 8, 2025
6396126
Merge branch 'master' into GH716_add_constructor_params
leehart Apr 24, 2025
50b3f5c
Add new public_url param to sample_metadata tests
leehart Apr 24, 2025
23e9012
Merge branch 'master' into GH716_add_constructor_params
leehart Apr 29, 2025
ea950fc
Merge branch 'master' into GH716_add_constructor_params
leehart May 1, 2025
fdebfd4
WIP: dev support for unrestricted_use_only, surveillance_use_only params
leehart May 23, 2025
62a848e
Merge branch 'master' into GH716_add_constructor_params
leehart May 23, 2025
d125707
WIP: amend data types
leehart May 23, 2025
a9f44c4
Add doc for _surveillance_flags sample_sets param
leehart May 23, 2025
3bad016
WIP: dev support for unrestricted_use_only, surveillance_use_only
leehart May 29, 2025
88347b0
WIP: update cnv_discordant_read_calls to honour constructor params
leehart May 29, 2025
519fec2
Convert dtype dict to defaultdict for pd.read_csv
leehart May 29, 2025
f64e64f
Convert df.index.names to List[str] for list
leehart May 29, 2025
613185c
Ignore type check for untyped array comparison in test_snp_frq.py. Re…
leehart May 29, 2025
e041a91
Use defaultdict for _aim_metadata_dtype for pd.read_csv
leehart May 29, 2025
82fa84d
Amend defaultdict assignment for _aim_metadata_dtype
leehart May 29, 2025
09f224a
Amend dtype data type for pd.read_csv
leehart May 29, 2025
21af2d2
Fix bug in applying aim_metadata_dtype. Amend data types.
leehart May 30, 2025
67ffa47
Raise ValueError when view_alignments is given irrelevant sample
leehart May 30, 2025
23e2e7c
Use raise from when re-wrapping exceptions in base.py to provide bett…
leehart May 30, 2025
e16eabb
Merge branch 'master' into GH716_add_constructor_params
leehart Jun 2, 2025
65fd83c
WIP: dev support for surveillance_use_only, unrestricted_use_only params
leehart Jun 3, 2025
7fce9bb
Revert cache name for biallelic_diplotypes. (Function behaviour uncha…
leehart Jun 3, 2025
df207c8
WIP: dev support for surveillance_use_only, unrestricted_use_only params
leehart Jun 3, 2025
d927b06
Fix misspelling
leehart Jun 3, 2025
1177e63
Use python engine for sample_query to support extension dtypes
leehart Jun 5, 2025
d9ba5eb
Add validate_sample_selection_params to funcs with sample_query, samp…
leehart Jun 10, 2025
9b7d6cc
Amend sample_metadata to allow sample_indices when surveillance_use_only
leehart Jun 10, 2025
6c4e74f
WIP: handle sample_indices when surveillance_use_only
leehart Jun 13, 2025
4b9904f
Amend snp_genotypes to handle sample_indices when surveillance_use_only
leehart Jun 17, 2025
0f20d3a
Merge branch 'master' into GH716_add_constructor_params
leehart Jul 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 33 additions & 8 deletions malariagen_data/af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ def __init__(
discordant_read_calls_analysis=None,
pre=False,
tqdm_class=None,
unrestricted_use_only=False,
surveillance_use_only=False,
**storage_options, # used by fsspec via init_filesystem()
):
super().__init__(
Expand Down Expand Up @@ -127,18 +129,23 @@ def __init__(
virtual_contigs=None,
gene_names=None,
inversion_tag_path=None,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
)

def __repr__(self):
text = (
f"<MalariaGEN Af1 API client>\n"
f"Storage URL : {self._url}\n"
f"Data releases available : {', '.join(self.releases)}\n"
f"Results cache : {self._results_cache}\n"
f"Cohorts analysis : {self._cohorts_analysis}\n"
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
f"Storage URL : {self._url}\n"
f"Data releases available : {', '.join(self._available_releases)}\n"
f"Results cache : {self._results_cache}\n"
f"Cohorts analysis : {self._cohorts_analysis}\n"
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
f"Relevant data releases : {', '.join(self.releases)}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
Expand Down Expand Up @@ -172,7 +179,7 @@ def _repr_html_(self):
<th style="text-align: left">
Data releases available
</th>
<td>{', '.join(self.releases)}</td>
<td>{', '.join(self._available_releases)}</td>
</tr>
<tr>
<th style="text-align: left">
Expand Down Expand Up @@ -204,6 +211,24 @@ def _repr_html_(self):
</th>
<td>{self.client_location}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for unrestricted use only
</th>
<td>{self._unrestricted_use_only}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for surveillance use only
</th>
<td>{self._surveillance_use_only}</td>
</tr>
<tr>
<th style="text-align: left">
Relevant data releases
</th>
<td>{', '.join(self.releases)}</td>
</tr>
</tbody>
</table>
"""
Expand Down
94 changes: 76 additions & 18 deletions malariagen_data/ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,18 @@ def _setup_aim_palettes():
"unassigned": "black",
}

# Note: These column names will be treated as case-insensitive,
# because these column names and the column names from the CSV
# will be converted to lowercase before applying these dtypes.
AIM_METADATA_DTYPE = {
"aim_species_fraction_arab": "float64",
"aim_species_fraction_colu": "float64",
"aim_species_fraction_colu_no2l": "float64",
"aim_species_gambcolu_arabiensis": "object",
"aim_species_gambiae_coluzzii": "object",
"aim_species": "object",
}


class Ag3(AnophelesDataResource):
"""Provides access to data from Ag3.x releases.
Expand Down Expand Up @@ -150,6 +162,8 @@ def __init__(
discordant_read_calls_analysis=None,
pre=False,
tqdm_class=None,
unrestricted_use_only=False,
surveillance_use_only=False,
**storage_options, # used by fsspec via init_filesystem()
):
super().__init__(
Expand All @@ -158,14 +172,7 @@ def __init__(
config_path=CONFIG_PATH,
cohorts_analysis=cohorts_analysis,
aim_analysis=aim_analysis,
aim_metadata_dtype={
"aim_species_fraction_arab": "float64",
"aim_species_fraction_colu": "float64",
"aim_species_fraction_colu_no2l": "float64",
"aim_species_gambcolu_arabiensis": "object",
"aim_species_gambiae_coluzzii": "object",
"aim_species": "object",
},
aim_metadata_dtype=AIM_METADATA_DTYPE,
aim_ids=("gambcolu_vs_arab", "gamb_vs_colu"),
aim_palettes=AIM_PALETTES,
site_filters_analysis=site_filters_analysis,
Expand Down Expand Up @@ -193,6 +200,8 @@ def __init__(
virtual_contigs=VIRTUAL_CONTIGS,
gene_names=GENE_NAMES,
inversion_tag_path=INVERSION_TAG_PATH,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
)

# set up caches
Expand All @@ -204,21 +213,24 @@ def v3_wild(self):
3.0 release, excluding the lab crosses."""
return [
x
for x in self.sample_sets(release="3.0")["sample_set"].tolist()
for x in self._available_sample_sets(release="3.0")["sample_set"].tolist()
if x != "AG1000G-X"
]

def __repr__(self):
text = (
f"<MalariaGEN Ag3 API client>\n"
f"Storage URL : {self._url}\n"
f"Data releases available : {', '.join(self.releases)}\n"
f"Results cache : {self._results_cache}\n"
f"Cohorts analysis : {self._cohorts_analysis}\n"
f"AIM analysis : {self._aim_analysis}\n"
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
f"Storage URL : {self._url}\n"
f"Data releases available : {', '.join(self._available_releases)}\n"
f"Results cache : {self._results_cache}\n"
f"Cohorts analysis : {self._cohorts_analysis}\n"
f"AIM analysis : {self._aim_analysis}\n"
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
f"Relevant data releases : {', '.join(self.releases)}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
Expand Down Expand Up @@ -252,7 +264,7 @@ def _repr_html_(self):
<th style="text-align: left">
Data releases available
</th>
<td>{', '.join(self.releases)}</td>
<td>{', '.join(self._available_releases)}</td>
</tr>
<tr>
<th style="text-align: left">
Expand Down Expand Up @@ -290,6 +302,24 @@ def _repr_html_(self):
</th>
<td>{self.client_location}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for unrestricted use only
</th>
<td>{self._unrestricted_use_only}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for surveillance use only
</th>
<td>{self._surveillance_use_only}</td>
</tr>
<tr>
<th style="text-align: left">
Relevant data releases
</th>
<td>{', '.join(self.releases)}</td>
</tr>
</tbody>
</table>
"""
Expand Down Expand Up @@ -337,6 +367,34 @@ def cross_metadata(self):
debug("drop 'phenotype' column, not used")
df.drop("phenotype", axis="columns", inplace=True)

# Identify the crosses sample set.
# Note: this sample set identifier is also hard-coded in `v3_wild()`.
crosses_sample_set = "AG1000G-X"

# If `_unrestricted_use_only` is `True`, then only return data if the crosses sample set has `unrestricted_use` set to `True`.
if (
self._unrestricted_use_only
and not self._sample_set_has_unrestricted_use(
sample_set=crosses_sample_set
)
):
# Remove all the data from the DataFrame and reset its index.
df = df.iloc[0:0].reset_index(drop=True)

# If `_surveillance_use_only` is `True`, then only return samples that have `is_surveillance` set to `True`.
if self._surveillance_use_only:
crosses_surveillance_flags_df = self._surveillance_flags(
sample_sets=[crosses_sample_set]
)
df = df.merge(
crosses_surveillance_flags_df[["sample_id", "is_surveillance"]],
on="sample_id",
how="left",
)
df = df[df["is_surveillance"]]
df = df.drop(columns=["is_surveillance"])

# Cache the cross metadata.
self._cache_cross_metadata = df

return self._cache_cross_metadata.copy()
Expand Down
52 changes: 33 additions & 19 deletions malariagen_data/anoph/aim_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,31 +138,45 @@ def aim_calls(
) -> xr.Dataset:
self._require_aim_analysis()

# Normalise parameters.
aims = self._prep_aims_param(aims=aims)
sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
# Prepare parameters.
prepared_aims = self._prep_aims_param(aims=aims)
del aims
prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
del sample_sets

# Access SNP calls and concatenate multiple sample sets and/or regions.
ly = []
for s in sample_sets_prepped:
y = self._aim_calls_dataset(
aims=aims,
sample_set=s,
prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
del sample_query

# Start a list of AIM calls Datasets, one for each sample set.
aim_calls_datasets = []

# For each sample set...
for sample_set in prepared_sample_sets:
# Get the AIM calls for all samples in the set, as an Xarray Dataset.
aim_calls_dataset = self._aim_calls_dataset(
aims=prepared_aims,
sample_set=sample_set,
)
ly.append(y)

# Add this Dataset to the list.
aim_calls_datasets.append(aim_calls_dataset)

# Concatenate data from multiple sample sets.
ds = simple_xarray_concat(ly, dim=DIM_SAMPLE)
ds = simple_xarray_concat(aim_calls_datasets, dim=DIM_SAMPLE)

# Handle sample query.
if sample_query is not None:
df_samples = self.sample_metadata(sample_sets=sample_sets_prepped)
# If there's a sample query...
if prepared_sample_query is not None:
# Get the relevant sample metadata.
df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)

# If there are no sample query options, then default to an empty dict.
sample_query_options = sample_query_options or {}
loc_samples = df_samples.eval(sample_query, **sample_query_options).values
if np.count_nonzero(loc_samples) == 0:
raise ValueError(f"No samples found for query {sample_query!r}")
ds = ds.isel(samples=loc_samples)

ds = self._filter_sample_dataset(
ds=ds,
df_samples=df_samples,
sample_query=prepared_sample_query,
sample_query_options=sample_query_options,
)

return ds

Expand Down
Loading
Loading