From fc6c2ba883bc363d9df3aad3bd6e2e5b15717479 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 7 Feb 2025 10:49:40 +0000
Subject: [PATCH 01/32] WIP: dev support for unrestricted_use_only,
surveillance_use_only
---
malariagen_data/af1.py | 18 ++++
malariagen_data/ag3.py | 18 ++++
malariagen_data/anoph/base.py | 10 ++
malariagen_data/anopheles.py | 4 +
tests/anoph/test_sample_metadata.py | 158 +++++++++++++++++++++++++++-
5 files changed, 207 insertions(+), 1 deletion(-)
diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py
index bc0a6141a..845944301 100644
--- a/malariagen_data/af1.py
+++ b/malariagen_data/af1.py
@@ -89,6 +89,8 @@ def __init__(
discordant_read_calls_analysis=None,
pre=False,
tqdm_class=None,
+ unrestricted_use_only=False,
+ surveillance_use_only=False,
**storage_options, # used by fsspec via init_filesystem()
):
super().__init__(
@@ -124,6 +126,8 @@ def __init__(
virtual_contigs=None,
gene_names=None,
inversion_tag_path=None,
+ unrestricted_use_only=unrestricted_use_only,
+ surveillance_use_only=surveillance_use_only,
)
def __repr__(self):
@@ -136,6 +140,8 @@ def __repr__(self):
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
+ f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
+ f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
@@ -201,6 +207,18 @@ def _repr_html_(self):
{self.client_location} |
+
+
+ Data filtered for unrestricted use only
+ |
+ {self._unrestricted_use_only} |
+
+
+
+ Data filtered for surveillance use only
+ |
+ {self._surveillance_use_only} |
+
"""
diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
index 443c594b2..3dfdb4f9e 100644
--- a/malariagen_data/ag3.py
+++ b/malariagen_data/ag3.py
@@ -150,6 +150,8 @@ def __init__(
discordant_read_calls_analysis=None,
pre=False,
tqdm_class=None,
+ unrestricted_use_only=False,
+ surveillance_use_only=False,
**storage_options, # used by fsspec via init_filesystem()
):
super().__init__(
@@ -192,6 +194,8 @@ def __init__(
virtual_contigs=VIRTUAL_CONTIGS,
gene_names=GENE_NAMES,
inversion_tag_path=INVERSION_TAG_PATH,
+ unrestricted_use_only=unrestricted_use_only,
+ surveillance_use_only=surveillance_use_only,
)
# set up caches
@@ -218,6 +222,8 @@ def __repr__(self):
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
+ f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
+ f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
@@ -289,6 +295,18 @@ def _repr_html_(self):
{self.client_location} |
+
+
+ Data filtered for unrestricted use only
+ |
+ {self._unrestricted_use_only} |
+
+
+
+ Data filtered for surveillance use only
+ |
+ {self._surveillance_use_only} |
+
"""
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index a214e6fbe..3cbaf75b9 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -61,6 +61,8 @@ def __init__(
storage_options: Optional[Mapping] = None,
results_cache: Optional[str] = None,
tqdm_class=None,
+ unrestricted_use_only: Optional[bool] = False,
+ surveillance_use_only: Optional[bool] = False,
):
# If show_progress has not been specified, then determine the default.
if show_progress is None:
@@ -85,6 +87,8 @@ def __init__(
if tqdm_class is None:
tqdm_class = tqdm_auto
self._tqdm_class = tqdm_class
+ self._unrestricted_use_only = unrestricted_use_only
+ self._surveillance_use_only = surveillance_use_only
# Set up logging.
self._log = LoggingHelper(name=__name__, out=log, debug=debug)
@@ -406,6 +410,7 @@ def _read_sample_sets(self, *, single_release: str):
`terms_of_use_url` is the URL of the terms of use,
`release` is the identifier of the release containing the sample set,
`unrestricted_use` whether the sample set can be without restriction (e.g., if the terms of use have expired).
+ If `unrestricted_use_only` is set to `True` then only sample sets with `unrestricted_use` set to `True` will be included.
""",
)
def sample_sets(
@@ -428,6 +433,11 @@ def sample_sets(
except KeyError:
# Read and cache dataframe for performance.
df = self._read_sample_sets(single_release=release)
+
+ # If unrestricted_use_only, restrict to sample sets with unrestricted_use.
+ if "unrestricted_use" in df.columns and self._unrestricted_use_only:
+ df = df[df["unrestricted_use"].astype(bool)]
+
self._cache_sample_sets[release] = df
elif isinstance(release, Sequence):
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 4a2f63dfe..07856e62e 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -141,6 +141,8 @@ def __init__(
virtual_contigs: Optional[Mapping[str, Sequence[str]]],
gene_names: Optional[Mapping[str, str]],
inversion_tag_path: Optional[str],
+ unrestricted_use_only: Optional[bool],
+ surveillance_use_only: Optional[bool],
):
super().__init__(
url=url,
@@ -175,6 +177,8 @@ def __init__(
virtual_contigs=virtual_contigs,
gene_names=gene_names,
inversion_tag_path=inversion_tag_path,
+ unrestricted_use_only=unrestricted_use_only,
+ surveillance_use_only=surveillance_use_only,
)
@property
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
index e5b8ec8eb..46e84a8be 100644
--- a/tests/anoph/test_sample_metadata.py
+++ b/tests/anoph/test_sample_metadata.py
@@ -7,7 +7,7 @@
import plotly.graph_objects as go # type: ignore
import pytest
from pandas.testing import assert_frame_equal
-from pytest_cases import parametrize_with_cases
+from pytest_cases import parametrize_with_cases, case
from typeguard import suppress_type_checks
from malariagen_data import af1 as _af1
@@ -36,6 +36,73 @@ def ag3_sim_api(ag3_sim_fixture):
)
+@pytest.fixture
+def ag3_sim_unrestricted_use_only_api(ag3_sim_fixture):
+ return AnophelesSampleMetadata(
+ url=ag3_sim_fixture.url,
+ config_path=_ag3.CONFIG_PATH,
+ major_version_number=_ag3.MAJOR_VERSION_NUMBER,
+ major_version_path=_ag3.MAJOR_VERSION_PATH,
+ pre=True,
+ aim_metadata_dtype={
+ "aim_species_fraction_arab": "float64",
+ "aim_species_fraction_colu": "float64",
+ "aim_species_fraction_colu_no2l": "float64",
+ "aim_species_gambcolu_arabiensis": object,
+ "aim_species_gambiae_coluzzii": object,
+ "aim_species": object,
+ },
+ taxon_colors=_ag3.TAXON_COLORS,
+ cohorts_analysis=ag3_sim_fixture.config["DEFAULT_COHORTS_ANALYSIS"],
+ unrestricted_use_only=True,
+ )
+
+
+@pytest.fixture
+def ag3_sim_surveillance_use_only_api(ag3_sim_fixture):
+ return AnophelesSampleMetadata(
+ url=ag3_sim_fixture.url,
+ config_path=_ag3.CONFIG_PATH,
+ major_version_number=_ag3.MAJOR_VERSION_NUMBER,
+ major_version_path=_ag3.MAJOR_VERSION_PATH,
+ pre=True,
+ aim_metadata_dtype={
+ "aim_species_fraction_arab": "float64",
+ "aim_species_fraction_colu": "float64",
+ "aim_species_fraction_colu_no2l": "float64",
+ "aim_species_gambcolu_arabiensis": object,
+ "aim_species_gambiae_coluzzii": object,
+ "aim_species": object,
+ },
+ taxon_colors=_ag3.TAXON_COLORS,
+ cohorts_analysis=ag3_sim_fixture.config["DEFAULT_COHORTS_ANALYSIS"],
+ surveillance_use_only=True,
+ )
+
+
+@pytest.fixture
+def ag3_sim_unrestricted_surveillance_use_only_api(ag3_sim_fixture):
+ return AnophelesSampleMetadata(
+ url=ag3_sim_fixture.url,
+ config_path=_ag3.CONFIG_PATH,
+ major_version_number=_ag3.MAJOR_VERSION_NUMBER,
+ major_version_path=_ag3.MAJOR_VERSION_PATH,
+ pre=True,
+ aim_metadata_dtype={
+ "aim_species_fraction_arab": "float64",
+ "aim_species_fraction_colu": "float64",
+ "aim_species_fraction_colu_no2l": "float64",
+ "aim_species_gambcolu_arabiensis": object,
+ "aim_species_gambiae_coluzzii": object,
+ "aim_species": object,
+ },
+ taxon_colors=_ag3.TAXON_COLORS,
+ cohorts_analysis=ag3_sim_fixture.config["DEFAULT_COHORTS_ANALYSIS"],
+ unrestricted_use_only=True,
+ surveillance_use_only=True,
+ )
+
+
@pytest.fixture
def af1_sim_api(af1_sim_fixture):
return AnophelesSampleMetadata(
@@ -48,6 +115,46 @@ def af1_sim_api(af1_sim_fixture):
)
+@pytest.fixture
+def af1_sim_unrestricted_use_only_api(af1_sim_fixture):
+ return AnophelesSampleMetadata(
+ url=af1_sim_fixture.url,
+ config_path=_af1.CONFIG_PATH,
+ major_version_number=_af1.MAJOR_VERSION_NUMBER,
+ major_version_path=_af1.MAJOR_VERSION_PATH,
+ pre=False,
+ taxon_colors=_af1.TAXON_COLORS,
+ unrestricted_use_only=True,
+ )
+
+
+@pytest.fixture
+def af1_sim_surveillance_use_only_api(af1_sim_fixture):
+ return AnophelesSampleMetadata(
+ url=af1_sim_fixture.url,
+ config_path=_af1.CONFIG_PATH,
+ major_version_number=_af1.MAJOR_VERSION_NUMBER,
+ major_version_path=_af1.MAJOR_VERSION_PATH,
+ pre=False,
+ taxon_colors=_af1.TAXON_COLORS,
+ surveillance_use_only=True,
+ )
+
+
+@pytest.fixture
+def af1_sim_unrestricted_surveillance_use_only_api(af1_sim_fixture):
+ return AnophelesSampleMetadata(
+ url=af1_sim_fixture.url,
+ config_path=_af1.CONFIG_PATH,
+ major_version_number=_af1.MAJOR_VERSION_NUMBER,
+ major_version_path=_af1.MAJOR_VERSION_PATH,
+ pre=False,
+ taxon_colors=_af1.TAXON_COLORS,
+ unrestricted_use_only=True,
+ surveillance_use_only=True,
+ )
+
+
@pytest.fixture
def missing_metadata_api(fixture_dir):
# In this fixture, one of the sample sets (AG1000G-BF-A) has missing files
@@ -69,14 +176,58 @@ def missing_metadata_api(fixture_dir):
)
+@case
def case_ag3_sim(ag3_sim_fixture, ag3_sim_api):
return ag3_sim_fixture, ag3_sim_api
+@case
def case_af1_sim(af1_sim_fixture, af1_sim_api):
return af1_sim_fixture, af1_sim_api
+@case
+def case_ag3_sim_unrestricted_use_only(
+ ag3_sim_fixture, ag3_sim_unrestricted_use_only_api
+):
+ return ag3_sim_fixture, ag3_sim_unrestricted_use_only_api
+
+
+@case
+def case_af1_sim_unrestricted_use_only(
+ af1_sim_fixture, af1_sim_unrestricted_use_only_api
+):
+ return af1_sim_fixture, af1_sim_unrestricted_use_only_api
+
+
+@case
+def case_ag3_sim_surveillance_use_only(
+ ag3_sim_fixture, ag3_sim_surveillance_use_only_api
+):
+ return ag3_sim_fixture, ag3_sim_surveillance_use_only_api
+
+
+@case
+def case_af1_sim_surveillance_use_only(
+ af1_sim_fixture, af1_sim_surveillance_use_only_api
+):
+ return af1_sim_fixture, af1_sim_surveillance_use_only_api
+
+
+@case
+def case_ag3_sim_unrestricted_surveillance_use_only(
+ ag3_sim_fixture, ag3_sim_unrestricted_surveillance_use_only_api
+):
+ return ag3_sim_fixture, ag3_sim_unrestricted_surveillance_use_only_api
+
+
+@case
+def case_af1_sim_unrestricted_surveillance_use_only(
+ af1_sim_fixture, af1_sim_unrestricted_surveillance_use_only_api
+):
+ return af1_sim_fixture, af1_sim_unrestricted_surveillance_use_only_api
+
+
def general_metadata_expected_columns():
return {
"sample_id": "O",
@@ -117,6 +268,11 @@ def test_general_metadata_with_single_sample_set(fixture, api: AnophelesSampleMe
df_sample_sets = api.sample_sets().set_index("sample_set")
sample_count = df_sample_sets["sample_count"]
all_sample_sets = df_sample_sets.index.to_list()
+
+ # FIXME: we should probably add more sample sets to the fixtures to test combinations of unrestricted_use_only and surveillance_use_only.
+ if len(all_sample_sets) == 0:
+ pytest.skip("Skipping because there are no relevant sample sets to test.")
+
sample_set = random.choice(all_sample_sets)
# Call function to be tested.
From dfdd4e2f5d5b19825e0f00df337fe69f5056815d Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Tue, 18 Feb 2025 14:40:44 +0000
Subject: [PATCH 02/32] Add test sample sets for Af1 with
unrestricted_use_only. Add relevant_releases property.
---
malariagen_data/anoph/base.py | 5 +
tests/anoph/conftest.py | 22 ++++-
.../samples.admin_units.csv | 82 +++++++++++++++
.../samples.cohorts.csv | 82 +++++++++++++++
.../samples.taxa.csv | 82 +++++++++++++++
.../samples.admin_units.csv | 77 +++++++++++++++
.../samples.cohorts.csv | 77 +++++++++++++++
.../samples.taxa.csv | 77 +++++++++++++++
.../sequence_qc_stats.csv | 82 +++++++++++++++
.../sequence_qc_stats.csv | 77 +++++++++++++++
.../samples.meta.csv | 82 +++++++++++++++
.../surveillance.flags.csv | 99 +++++++++++++++++++
.../wgs_accession_data.csv | 82 +++++++++++++++
.../wgs_snp_data.csv | 82 +++++++++++++++
.../samples.meta.csv | 77 +++++++++++++++
.../surveillance.flags.csv | 80 +++++++++++++++
.../wgs_accession_data.csv | 77 +++++++++++++++
.../wgs_snp_data.csv | 77 +++++++++++++++
tests/anoph/test_sample_metadata.py | 10 +-
19 files changed, 1323 insertions(+), 6 deletions(-)
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.admin_units.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.cohorts.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.taxa.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.admin_units.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.cohorts.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.taxa.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/curation/1232-VO-KE-OCHOMO-VMF00044/sequence_qc_stats.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/curation/1235-VO-MZ-PAAIJMANS-VMF00094/sequence_qc_stats.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/samples.meta.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/surveillance.flags.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/wgs_accession_data.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/wgs_snp_data.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/samples.meta.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/surveillance.flags.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/wgs_accession_data.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/wgs_snp_data.csv
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index 3cbaf75b9..a68f70e5a 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -357,6 +357,11 @@ def releases(self) -> Tuple[str, ...]:
self._cache_releases = self._public_releases()
return self._cache_releases
+ @property
+ def relevant_releases(self) -> Tuple[str, ...]:
+ """Relevant data releases, i.e., releases for which `sample_sets()` returns at least one sample set. When `unrestricted_use_only` is set to `True`, this excludes releases that contain no unrestricted-use sample sets."""
+ return tuple(r for r in self.releases if not self.sample_sets(release=r).empty)
+
@property
def client_location(self) -> str:
details = self._client_details
diff --git a/tests/anoph/conftest.py b/tests/anoph/conftest.py
index 1564b029d..9d41d0736 100644
--- a/tests/anoph/conftest.py
+++ b/tests/anoph/conftest.py
@@ -1894,27 +1894,37 @@ def init_public_release_manifest(self):
"1229-VO-GH-DADZIE-VMF00095",
"1230-VO-GA-CF-AYALA-VMF00045",
"1231-VO-MULTI-WONDJI-VMF00043",
+ "1232-VO-KE-OCHOMO-VMF00044",
+ "1235-VO-MZ-PAAIJMANS-VMF00094",
],
- "sample_count": [26, 40, 32],
+ "sample_count": [26, 40, 32, 20, 20],
"study_id": [
"1229-VO-GH-DADZIE",
"1230-VO-MULTI-AYALA",
"1231-VO-MULTI-WONDJI",
+ "1232-VO-KE-OCHOMO",
+ "1235-VO-MZ-PAAIJMANS",
],
"study_url": [
"https://www.malariagen.net/network/where-we-work/1229-VO-GH-DADZIE",
"https://www.malariagen.net/network/where-we-work/1230-VO-MULTI-AYALA",
"https://www.malariagen.net/network/where-we-work/1231-VO-MULTI-WONDJI",
+ "https://www.malariagen.net/network/where-we-work/1232-VO-KE-OCHOMO",
+ "https://www.malariagen.net/network/where-we-work/1235-VO-MZ-PAAIJMANS",
],
"terms_of_use_expiry_date": [
"2025-06-01",
"2025-06-01",
"2025-06-01",
+ "2024-01-01", # Set to the past in order to test unrestricted_use_only.
+ "2024-01-01", # Set to the past in order to test unrestricted_use_only. (We need at least 2 sets.)
],
"terms_of_use_url": [
"https://malariagen.github.io/vector-data/af1/af1.0.html#terms-of-use",
"https://malariagen.github.io/vector-data/af1/af1.0.html#terms-of-use",
"https://malariagen.github.io/vector-data/af1/af1.0.html#terms-of-use",
+ "https://malariagen.github.io/vector-data/af1/af1.0.html#terms-of-use",
+ "https://malariagen.github.io/vector-data/af1/af1.0.html#terms-of-use",
],
}
)
@@ -2111,6 +2121,16 @@ def init_metadata(self):
release_path="v1.0",
sample_set="1231-VO-MULTI-WONDJI-VMF00043",
)
+ self.write_metadata(
+ release="1.0",
+ release_path="v1.0",
+ sample_set="1232-VO-KE-OCHOMO-VMF00044",
+ )
+ self.write_metadata(
+ release="1.0",
+ release_path="v1.0",
+ sample_set="1235-VO-MZ-PAAIJMANS-VMF00094",
+ )
def init_snp_sites(self):
path = self.bucket_path / "v1.0/snp_genotypes/all/sites/"
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.admin_units.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.admin_units.csv
new file mode 100644
index 000000000..94e28651e
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.admin_units.csv
@@ -0,0 +1,82 @@
+sample_id,country,country_ISO,adm1_name,adm1_ISO,adm2_name
+VBS17631,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17632,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17633,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17634,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17635,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17636,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17637,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17638,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17641,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17642,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17643,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17645,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17646,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17647,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17648,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17649,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17650,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17651,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17652,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17653,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17654,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17657,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17658,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17659,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17660,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17661,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17663,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17664,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17665,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17666,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17668,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17669,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17672,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17673,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17674,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17676,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17677,Kenya,KEN,Kisumu,KE-17,Nyando
+VBS17678,Kenya,KEN,Migori,KE-27,Migori
+VBS17680,Kenya,KEN,Migori,KE-27,Migori
+VBS17681,Kenya,KEN,Migori,KE-27,Migori
+VBS17697,Kenya,KEN,Migori,KE-27,Migori
+VBS17698,Kenya,KEN,Migori,KE-27,Migori
+VBS17679,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17689,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17700,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17701,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17702,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17703,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17704,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17705,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17706,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17707,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17709,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17710,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17711,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17714,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17715,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17716,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17717,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17718,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17719,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17720,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17721,Kenya,KEN,Bungoma,KE-03,Mt Elgon
+VBS17683,Kenya,KEN,Migori,KE-27,Migori
+VBS17684,Kenya,KEN,Migori,KE-27,Migori
+VBS17685,Kenya,KEN,Migori,KE-27,Migori
+VBS17699,Kenya,KEN,Migori,KE-27,Migori
+VBS17723,Kenya,KEN,Migori,KE-27,Migori
+VBS17724,Kenya,KEN,Migori,KE-27,Migori
+VBS17725,Kenya,KEN,Migori,KE-27,Migori
+VBS17726,Kenya,KEN,Migori,KE-27,Migori
+VBS17686,Kenya,KEN,Migori,KE-27,Migori
+VBS17687,Kenya,KEN,Migori,KE-27,Migori
+VBS17690,Kenya,KEN,Migori,KE-27,Migori
+VBS17691,Kenya,KEN,Migori,KE-27,Migori
+VBS17692,Kenya,KEN,Migori,KE-27,Migori
+VBS17693,Kenya,KEN,Migori,KE-27,Migori
+VBS17694,Kenya,KEN,Migori,KE-27,Migori
+VBS17695,Kenya,KEN,Migori,KE-27,Migori
+VBS17696,Kenya,KEN,Migori,KE-27,Migori
+VBS17722,Kenya,KEN,Migori,KE-27,Migori
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.cohorts.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.cohorts.csv
new file mode 100644
index 000000000..9bc70a36f
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.cohorts.csv
@@ -0,0 +1,82 @@
+sample_id,country_ISO,adm1_name,adm1_ISO,adm2_name,taxon,cohort_admin1_year,cohort_admin1_month,cohort_admin2_year,cohort_admin2_month
+VBS17631,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17632,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17633,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17634,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17635,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17636,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17637,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17638,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17641,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17642,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17643,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17645,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17646,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17647,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17648,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17649,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17650,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17651,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17652,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17653,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17654,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17657,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17658,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17659,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17660,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17661,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17663,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17664,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17665,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17666,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17668,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17669,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17672,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17673,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17674,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17676,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17677,KEN,Kisumu,KE-17,Nyando,funestus,KE-17_fune_2014,KE-17_fune_2014_06,KE-17_Nyando_fune_2014,KE-17_Nyando_fune_2014_06
+VBS17678,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17679,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17680,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17681,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17683,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17684,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17685,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17686,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17687,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17689,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17690,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17691,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17692,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17693,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17694,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17695,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17696,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17697,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17698,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17699,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17700,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17701,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17702,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17703,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17704,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17705,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17706,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17707,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17709,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17710,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17711,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17714,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17715,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17716,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17717,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17718,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17719,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17720,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17721,KEN,Bungoma,KE-03,Mt Elgon,funestus,KE-03_fune_2016,KE-03_fune_2016_10,KE-03_Mt-Elgon_fune_2016,KE-03_Mt-Elgon_fune_2016_10
+VBS17722,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17723,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17724,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17725,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
+VBS17726,KEN,Migori,KE-27,Migori,funestus,KE-27_fune_2016,KE-27_fune_2016_10,KE-27_Migori_fune_2016,KE-27_Migori_fune_2016_10
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.taxa.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.taxa.csv
new file mode 100644
index 000000000..0c1a6827d
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1232-VO-KE-OCHOMO-VMF00044/samples.taxa.csv
@@ -0,0 +1,82 @@
+sample_id,taxon
+VBS17631,funestus
+VBS17632,funestus
+VBS17633,funestus
+VBS17634,funestus
+VBS17635,funestus
+VBS17636,funestus
+VBS17637,funestus
+VBS17638,funestus
+VBS17641,funestus
+VBS17642,funestus
+VBS17643,funestus
+VBS17645,funestus
+VBS17646,funestus
+VBS17647,funestus
+VBS17648,funestus
+VBS17649,funestus
+VBS17650,funestus
+VBS17651,funestus
+VBS17652,funestus
+VBS17653,funestus
+VBS17654,funestus
+VBS17657,funestus
+VBS17658,funestus
+VBS17659,funestus
+VBS17660,funestus
+VBS17661,funestus
+VBS17663,funestus
+VBS17664,funestus
+VBS17665,funestus
+VBS17666,funestus
+VBS17668,funestus
+VBS17669,funestus
+VBS17672,funestus
+VBS17673,funestus
+VBS17674,funestus
+VBS17676,funestus
+VBS17677,funestus
+VBS17678,funestus
+VBS17679,funestus
+VBS17680,funestus
+VBS17681,funestus
+VBS17683,funestus
+VBS17684,funestus
+VBS17685,funestus
+VBS17686,funestus
+VBS17687,funestus
+VBS17689,funestus
+VBS17690,funestus
+VBS17691,funestus
+VBS17692,funestus
+VBS17693,funestus
+VBS17694,funestus
+VBS17695,funestus
+VBS17696,funestus
+VBS17697,funestus
+VBS17698,funestus
+VBS17699,funestus
+VBS17700,funestus
+VBS17701,funestus
+VBS17702,funestus
+VBS17703,funestus
+VBS17704,funestus
+VBS17705,funestus
+VBS17706,funestus
+VBS17707,funestus
+VBS17709,funestus
+VBS17710,funestus
+VBS17711,funestus
+VBS17714,funestus
+VBS17715,funestus
+VBS17716,funestus
+VBS17717,funestus
+VBS17718,funestus
+VBS17719,funestus
+VBS17720,funestus
+VBS17721,funestus
+VBS17722,funestus
+VBS17723,funestus
+VBS17724,funestus
+VBS17725,funestus
+VBS17726,funestus
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.admin_units.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.admin_units.csv
new file mode 100644
index 000000000..9b389af0d
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.admin_units.csv
@@ -0,0 +1,77 @@
+sample_id,country,country_ISO,adm1_name,adm1_ISO,adm2_name
+VBS24095,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24096,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24097,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24098,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24101,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24104,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24105,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24117,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24118,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24119,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24120,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24121,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24122,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24123,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24124,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24125,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24126,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24127,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24128,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24129,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24130,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24132,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24135,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24137,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24138,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24141,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24142,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24143,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24144,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24145,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24147,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24148,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24149,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24150,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24152,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24153,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24154,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24155,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24156,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24157,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24159,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24160,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24161,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24162,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24163,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24164,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24165,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24166,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24167,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24168,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24169,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24170,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24171,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24172,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24173,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24174,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24175,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24176,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24177,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24178,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24179,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24180,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24181,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24182,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24183,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24184,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24185,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24186,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24187,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24188,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24189,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24190,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24191,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24192,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24193,Mozambique,MOZ,Maputo,MZ-L,Manhica
+VBS24194,Mozambique,MOZ,Maputo,MZ-L,Manhica
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.cohorts.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.cohorts.csv
new file mode 100644
index 000000000..6f18de9c9
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.cohorts.csv
@@ -0,0 +1,77 @@
+sample_id,country_ISO,adm1_name,adm1_ISO,adm2_name,taxon,cohort_admin1_year,cohort_admin1_month,cohort_admin2_year,cohort_admin2_month
+VBS24095,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24096,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24097,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24098,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24101,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24104,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24105,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24117,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24118,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24119,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24120,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24121,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24122,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24123,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24124,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24125,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24126,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24127,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24128,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24129,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24130,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24132,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24135,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24137,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24138,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24141,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24142,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24143,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24144,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24145,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24147,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24148,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24149,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24150,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24152,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24153,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24154,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24155,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24156,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24157,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24159,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24160,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24161,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24162,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24163,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24164,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24165,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24166,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24167,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24168,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24169,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24170,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24171,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24172,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24173,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24174,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24175,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24176,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24177,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24178,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24179,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24180,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24181,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24182,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24183,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_01,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_01
+VBS24184,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24185,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24186,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24187,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24188,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24189,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24190,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24191,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24192,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24193,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
+VBS24194,MOZ,Maputo,MZ-L,Manhica,funestus,MZ-L_fune_2018,MZ-L_fune_2018_02,MZ-L_Manhica_fune_2018,MZ-L_Manhica_fune_2018_02
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.taxa.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.taxa.csv
new file mode 100644
index 000000000..e6ee64bcd
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/cohorts_20221129/1235-VO-MZ-PAAIJMANS-VMF00094/samples.taxa.csv
@@ -0,0 +1,77 @@
+sample_id,taxon
+VBS24095,funestus
+VBS24096,funestus
+VBS24097,funestus
+VBS24098,funestus
+VBS24101,funestus
+VBS24104,funestus
+VBS24105,funestus
+VBS24117,funestus
+VBS24118,funestus
+VBS24119,funestus
+VBS24120,funestus
+VBS24121,funestus
+VBS24122,funestus
+VBS24123,funestus
+VBS24124,funestus
+VBS24125,funestus
+VBS24126,funestus
+VBS24127,funestus
+VBS24128,funestus
+VBS24129,funestus
+VBS24130,funestus
+VBS24132,funestus
+VBS24135,funestus
+VBS24137,funestus
+VBS24138,funestus
+VBS24141,funestus
+VBS24142,funestus
+VBS24143,funestus
+VBS24144,funestus
+VBS24145,funestus
+VBS24147,funestus
+VBS24148,funestus
+VBS24149,funestus
+VBS24150,funestus
+VBS24152,funestus
+VBS24153,funestus
+VBS24154,funestus
+VBS24155,funestus
+VBS24156,funestus
+VBS24157,funestus
+VBS24159,funestus
+VBS24160,funestus
+VBS24161,funestus
+VBS24162,funestus
+VBS24163,funestus
+VBS24164,funestus
+VBS24165,funestus
+VBS24166,funestus
+VBS24167,funestus
+VBS24168,funestus
+VBS24169,funestus
+VBS24170,funestus
+VBS24171,funestus
+VBS24172,funestus
+VBS24173,funestus
+VBS24174,funestus
+VBS24175,funestus
+VBS24176,funestus
+VBS24177,funestus
+VBS24178,funestus
+VBS24179,funestus
+VBS24180,funestus
+VBS24181,funestus
+VBS24182,funestus
+VBS24183,funestus
+VBS24184,funestus
+VBS24185,funestus
+VBS24186,funestus
+VBS24187,funestus
+VBS24188,funestus
+VBS24189,funestus
+VBS24190,funestus
+VBS24191,funestus
+VBS24192,funestus
+VBS24193,funestus
+VBS24194,funestus
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/curation/1232-VO-KE-OCHOMO-VMF00044/sequence_qc_stats.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/curation/1232-VO-KE-OCHOMO-VMF00044/sequence_qc_stats.csv
new file mode 100644
index 000000000..22298275b
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/curation/1232-VO-KE-OCHOMO-VMF00044/sequence_qc_stats.csv
@@ -0,0 +1,82 @@
+sample_id,mean_cov,median_cov,modal_cov,mean_cov_2RL,median_cov_2RL,mode_cov_2RL,mean_cov_3RL,median_cov_3RL,mode_cov_3RL,mean_cov_X,median_cov_X,mode_cov_X,frac_gen_cov,divergence,contam_pct,contam_LLR
+VBS17631,27.6,26,25,27.43,26,25,27.57,26,25,28.59,25,24,0.937,0.01959,3.309,5536.271
+VBS17632,21.58,20,20,21.52,20,20,21.67,20,20,21.57,20,19,0.932,0.01995,4.44,4998.474
+VBS17633,21.0,19,19,20.86,20,19,21.0,20,19,21.71,19,18,0.933,0.01979,2.159,1911.306
+VBS17634,16.48,15,15,16.4,15,15,16.51,15,15,16.75,15,14,0.927,0.0202,2.496,1686.002
+VBS17635,32.21,31,31,32.16,31,31,32.3,31,31,32.09,30,30,0.937,0.01969,1.536,1880.206
+VBS17636,25.2,24,24,25.18,24,24,25.19,24,24,25.3,24,23,0.934,0.01977,1.246,1133.03
+VBS17637,26.02,25,24,25.96,25,25,26.06,25,25,26.13,24,24,0.935,0.01964,1.781,1886.931
+VBS17638,27.28,26,26,27.28,26,26,27.35,26,26,27.01,25,25,0.934,0.01978,1.754,2220.809
+VBS17641,23.31,22,22,23.23,22,22,23.39,22,22,23.37,21,21,0.934,0.01943,1.831,1567.822
+VBS17642,31.72,31,31,32.87,32,32,32.91,32,31,21.37,17,16,0.935,0.01984,1.024,1223.408
+VBS17643,14.19,13,12,14.17,13,12,14.18,13,12,14.34,13,12,0.925,0.02096,4.126,2856.472
+VBS17645,25.78,25,24,25.89,25,25,25.89,25,24,24.86,24,24,0.933,0.01944,2.935,3359.484
+VBS17646,23.36,22,22,23.32,22,22,23.35,22,22,23.63,22,21,0.931,0.01981,1.948,1903.665
+VBS17647,14.44,13,12,14.38,13,12,14.51,13,12,14.46,12,12,0.927,0.02042,3.177,2014.292
+VBS17648,32.5,31,31,32.46,31,31,32.65,31,31,32.08,30,30,0.935,0.01975,1.282,1610.687
+VBS17649,28.09,26,25,28.0,26,26,28.15,27,25,28.28,26,25,0.936,0.01982,2.445,3566.67
+VBS17650,25.59,24,24,25.54,24,24,25.69,24,24,25.42,23,23,0.933,0.01982,2.494,3155.809
+VBS17651,27.31,26,26,27.24,26,26,27.29,26,26,27.74,25,25,0.934,0.01939,2.961,3870.03
+VBS17652,29.98,29,28,29.91,29,28,29.98,29,28,30.35,28,28,0.934,0.0198,1.881,2434.238
+VBS17653,32.61,32,31,32.52,32,31,32.63,32,31,32.93,31,30,0.938,0.01966,1.654,1987.669
+VBS17654,33.66,33,32,33.56,33,32,33.64,33,32,34.25,32,32,0.938,0.01965,1.441,1934.526
+VBS17657,17.96,17,16,17.95,17,16,17.99,17,16,17.91,16,15,0.93,0.01997,2.275,1660.872
+VBS17658,28.25,27,27,28.23,27,27,28.28,27,27,28.18,26,26,0.934,0.01983,1.822,2366.192
+VBS17659,14.7,13,12,14.61,13,13,14.7,13,13,15.12,13,12,0.927,0.02051,3.234,2234.331
+VBS17660,29.11,28,27,29.04,28,28,29.09,28,28,29.52,27,27,0.935,0.0197,1.156,1186.054
+VBS17661,21.35,20,20,21.29,20,20,21.39,20,20,21.56,20,19,0.932,0.01965,1.88,1347.599
+VBS17663,27.71,27,26,27.69,27,26,27.78,27,26,27.52,26,25,0.934,0.0202,1.912,2253.497
+VBS17664,18.43,17,16,18.34,17,16,18.51,17,16,18.5,16,16,0.93,0.02007,2.624,2094.808
+VBS17665,20.16,19,19,20.07,19,19,20.21,19,19,20.41,19,18,0.931,0.0196,1.651,1188.595
+VBS17666,27.07,26,25,27.09,26,25,27.2,26,25,26.43,25,25,0.933,0.01967,1.607,1631.496
+VBS17668,22.54,21,21,22.45,21,21,22.59,21,21,22.76,21,20,0.932,0.01969,2.786,2358.669
+VBS17669,19.49,18,17,19.38,18,18,19.58,18,17,19.65,18,17,0.93,0.02007,2.299,1868.477
+VBS17672,14.89,13,12,14.74,13,12,14.94,13,12,15.41,13,12,0.928,0.02069,2.855,1883.832
+VBS17673,19.46,19,18,19.46,19,18,19.54,19,18,19.14,18,17,0.93,0.01975,2.723,2280.923
+VBS17674,21.14,20,19,21.08,20,19,21.27,20,19,20.96,19,18,0.93,0.01975,2.297,1860.104
+VBS17676,33.03,32,33,34.21,33,33,34.41,33,33,21.68,17,16,0.935,0.01989,1.068,1339.966
+VBS17677,23.37,22,21,23.22,22,21,23.51,22,22,23.58,21,21,0.934,0.01954,2.26,2208.403
+VBS17678,27.34,26,25,27.34,26,25,27.18,25,24,27.97,26,25,0.934,0.0195,3.688,5276.694
+VBS17679,20.82,20,19,20.76,20,19,20.98,20,19,20.44,19,19,0.934,0.01947,3.608,3405.729
+VBS17680,69.51,69,70,70.2,70,70,69.37,69,69,66.74,68,70,0.945,0.0191,1.102,2831.698
+VBS17681,64.7,65,65,65.34,65,65,64.76,65,65,61.34,63,64,0.945,0.01906,1.172,2896.359
+VBS17683,44.59,44,44,44.96,44,44,44.68,44,44,42.46,42,43,0.944,0.01914,1.743,3261.647
+VBS17684,38.25,38,37,38.85,38,38,38.2,37,37,35.52,36,36,0.94,0.01867,2.075,3881.38
+VBS17685,41.46,41,41,41.81,41,41,41.79,41,41,38.5,38,39,0.94,0.01901,1.331,2057.739
+VBS17686,59.85,60,61,60.61,60,61,60.04,60,60,55.45,57,59,0.943,0.01912,1.333,3230.891
+VBS17687,51.74,52,52,52.52,52,52,51.9,52,52,47.37,48,49,0.942,0.01917,1.388,2880.312
+VBS17689,59.38,59,58,60.09,59,59,59.16,58,58,56.8,57,59,0.944,0.01904,1.468,3864.948
+VBS17690,50.48,50,50,50.98,51,51,50.35,50,50,48.59,49,50,0.942,0.01909,1.309,2685.015
+VBS17691,70.52,71,72,71.37,72,72,70.42,71,71,66.83,69,72,0.946,0.01887,1.338,4108.085
+VBS17692,63.4,63,63,64.04,64,64,63.58,63,63,59.55,61,63,0.943,0.01882,1.212,2809.982
+VBS17693,55.83,56,57,56.25,56,57,55.88,56,56,53.64,55,57,0.943,0.01933,1.392,2865.184
+VBS17694,47.34,47,47,48.07,47,47,47.13,47,47,44.71,45,47,0.94,0.01944,1.71,3059.787
+VBS17695,51.58,51,51,51.89,51,52,51.74,51,51,49.47,49,50,0.944,0.01907,1.34,2721.966
+VBS17696,40.79,40,39,41.79,41,39,40.65,39,38,36.44,35,35,0.936,0.01916,1.595,2555.161
+VBS17697,41.73,41,41,42.05,41,41,41.7,41,41,40.29,40,41,0.941,0.01888,1.227,1759.507
+VBS17698,38.63,37,37,38.49,37,37,38.64,37,37,39.29,36,36,0.937,0.01931,1.402,1868.901
+VBS17699,45.76,46,46,46.3,46,46,46.02,46,46,42.11,43,44,0.939,0.01922,1.296,2097.78
+VBS17700,38.57,37,37,38.54,37,37,38.55,37,36,38.86,37,36,0.937,0.01909,1.489,2365.357
+VBS17701,25.28,24,24,25.23,24,24,25.44,24,24,24.94,23,23,0.934,0.01917,1.724,1726.185
+VBS17702,52.23,52,52,52.74,52,52,52.54,52,52,48.55,49,50,0.94,0.01935,1.189,2275.677
+VBS17703,28.68,28,27,28.89,28,28,28.71,28,27,27.52,27,27,0.939,0.01884,3.267,5414.473
+VBS17704,29.07,28,28,29.28,28,28,28.91,28,27,28.65,27,27,0.934,0.01917,2.05,2602.453
+VBS17705,44.04,43,43,44.08,43,43,44.39,43,43,42.48,41,42,0.939,0.0191,1.172,1747.602
+VBS17706,31.5,30,30,31.43,30,30,31.45,30,30,32.05,30,30,0.937,0.01898,1.64,2113.249
+VBS17707,31.86,31,30,31.86,31,30,31.87,31,30,31.85,30,30,0.936,0.01918,1.428,1478.911
+VBS17709,43.6,43,43,43.98,44,43,43.98,43,43,40.27,40,41,0.938,0.01905,1.103,1620.9
+VBS17710,35.92,34,34,35.65,34,34,36.07,34,34,36.66,34,33,0.939,0.01902,1.732,2755.965
+VBS17711,33.05,32,32,33.01,32,32,33.03,32,31,33.31,32,32,0.939,0.01901,1.519,1902.196
+VBS17714,18.77,17,16,18.73,17,16,18.95,17,16,18.26,16,14,0.931,0.01999,2.524,1896.088
+VBS17715,61.35,61,62,62.0,62,62,61.63,61,62,57.13,58,61,0.946,0.01929,1.478,3794.331
+VBS17716,31.99,31,31,32.57,32,31,31.99,31,31,29.17,28,29,0.936,0.01913,1.564,1966.89
+VBS17717,41.57,41,41,41.96,42,42,41.63,41,41,39.41,39,41,0.942,0.0191,1.513,2604.966
+VBS17718,45.33,45,45,45.98,46,45,45.14,45,44,42.98,43,44,0.942,0.0187,1.592,3020.652
+VBS17719,44.87,45,45,45.34,45,45,45.03,44,45,41.96,43,44,0.941,0.01926,1.462,2417.077
+VBS17720,37.63,37,37,37.78,37,37,37.68,37,37,36.74,36,37,0.938,0.01896,1.601,2459.302
+VBS17721,57.14,57,57,57.83,58,58,57.14,57,57,53.88,54,56,0.944,0.01905,1.372,3375.882
+VBS17722,62.04,62,62,62.65,62,62,61.95,61,61,59.49,60,62,0.945,0.01888,1.315,3181.964
+VBS17723,24.3,24,23,24.53,24,23,24.4,24,23,22.83,22,22,0.934,0.01887,2.126,2047.114
+VBS17724,26.29,25,25,26.42,25,25,26.56,25,25,24.6,23,23,0.937,0.01864,2.192,2469.586
+VBS17725,37.6,37,37,37.98,37,37,37.8,37,37,34.96,35,35,0.938,0.01923,1.331,1620.625
+VBS17726,56.13,56,56,56.86,56,56,56.01,56,56,53.13,54,55,0.943,0.01925,1.576,3592.906
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/curation/1235-VO-MZ-PAAIJMANS-VMF00094/sequence_qc_stats.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/curation/1235-VO-MZ-PAAIJMANS-VMF00094/sequence_qc_stats.csv
new file mode 100644
index 000000000..392faeada
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/curation/1235-VO-MZ-PAAIJMANS-VMF00094/sequence_qc_stats.csv
@@ -0,0 +1,77 @@
+sample_id,mean_cov,median_cov,modal_cov,mean_cov_2RL,median_cov_2RL,mode_cov_2RL,mean_cov_3RL,median_cov_3RL,mode_cov_3RL,mean_cov_X,median_cov_X,mode_cov_X,frac_gen_cov,divergence,contam_pct,contam_LLR
+VBS24095,13.58,13,13,13.56,13,13,13.67,13,13,13.29,12,12,0.93,0.01912,1.097,399.996
+VBS24096,15.41,15,14,15.44,15,14,15.44,15,14,15.13,14,14,0.927,0.0197,0.997,460.677
+VBS24097,12.74,12,11,12.7,12,11,12.89,12,11,12.37,11,10,0.929,0.01987,2.203,932.451
+VBS24098,13.09,12,12,13.1,12,12,13.17,12,12,12.72,12,11,0.927,0.02022,2.176,1270.775
+VBS24101,11.72,11,10,11.64,11,10,11.74,11,10,11.95,11,10,0.926,0.02054,1.298,519.782
+VBS24104,13.25,13,12,13.31,13,13,13.31,13,12,12.77,12,12,0.931,0.01964,1.152,438.176
+VBS24105,24.17,23,23,24.25,23,23,24.26,23,23,23.43,22,22,0.936,0.0187,1.573,1630.345
+VBS24117,40.56,39,38,40.54,39,38,40.97,39,38,39.11,38,37,0.944,0.01811,1.005,1223.043
+VBS24118,14.93,14,13,14.88,14,13,15.04,14,13,14.69,13,12,0.931,0.01907,0.812,325.528
+VBS24119,41.47,41,41,41.64,41,41,41.67,41,41,39.89,39,40,0.942,0.01825,0.908,1263.715
+VBS24120,39.85,39,39,40.01,39,39,40.11,39,39,38.06,37,38,0.943,0.01846,0.956,1196.472
+VBS24121,44.44,44,44,44.69,44,44,44.77,44,44,41.95,42,43,0.944,0.01809,1.052,1456.251
+VBS24122,37.35,36,36,37.71,37,36,37.6,37,36,34.61,34,34,0.94,0.01861,1.148,1435.382
+VBS24123,47.49,47,47,47.87,47,47,47.95,47,47,43.93,45,46,0.944,0.0181,1.035,1487.318
+VBS24124,37.44,36,36,37.34,37,36,37.69,37,36,37.04,35,35,0.947,0.01821,3.017,8618.44
+VBS24125,38.85,36,34,38.74,37,35,39.17,37,35,38.12,35,33,0.941,0.01795,0.925,1184.662
+VBS24126,46.37,45,45,46.64,45,45,46.09,45,45,46.16,44,44,0.941,0.01882,0.763,1194.4
+VBS24127,37.75,36,34,37.71,36,34,38.01,36,35,36.93,34,33,0.941,0.01861,0.999,1185.909
+VBS24128,29.67,27,25,29.39,27,25,29.94,27,25,29.99,26,24,0.941,0.01797,0.914,819.404
+VBS24129,36.37,36,37,36.69,37,37,36.21,36,37,35.46,35,37,0.939,0.01816,0.843,877.858
+VBS24130,32.71,31,30,32.75,31,30,32.7,31,30,32.5,30,29,0.941,0.01812,0.862,710.685
+VBS24132,38.74,38,38,39.08,38,38,38.82,38,37,36.83,37,38,0.939,0.01836,0.92,1164.515
+VBS24135,45.03,42,38,44.61,42,38,45.37,42,38,45.71,41,36,0.941,0.01857,0.842,1139.028
+VBS24137,33.99,32,30,33.63,32,30,33.87,32,30,36.16,32,30,0.942,0.01834,3.643,9878.213
+VBS24138,41.79,39,37,41.69,39,37,41.83,39,37,42.05,38,35,0.943,0.01836,0.942,1054.288
+VBS24141,46.75,45,44,46.94,46,45,47.02,46,45,44.84,43,43,0.942,0.01858,0.877,1341.67
+VBS24142,40.5,40,40,40.71,40,40,40.83,40,40,38.21,38,39,0.941,0.01827,1.002,1423.146
+VBS24143,32.58,31,30,32.47,31,30,32.58,31,31,33.14,31,30,0.94,0.01841,0.929,896.843
+VBS24144,27.89,27,26,27.94,27,26,28.08,27,27,26.91,26,26,0.94,0.01798,1.257,1044.101
+VBS24145,46.07,44,41,46.08,44,41,46.67,44,42,43.66,41,39,0.943,0.01804,0.992,1307.938
+VBS24147,38.14,37,36,38.18,37,36,38.35,37,36,37.1,35,35,0.941,0.01856,0.886,1057.293
+VBS24148,33.37,31,30,32.91,31,30,33.71,32,30,34.27,31,29,0.943,0.01773,1.277,1464.178
+VBS24149,30.32,31,34,31.91,32,34,31.38,31,34,18.27,16,16,0.937,0.01822,1.231,1137.891
+VBS24150,43.07,44,49,45.28,46,49,44.59,45,49,26.22,23,23,0.942,0.0182,1.015,1324.911
+VBS24152,34.6,33,30,34.85,33,32,34.03,32,30,35.62,33,32,0.94,0.01833,1.197,1139.607
+VBS24153,26.53,23,17,27.82,24,18,27.13,23,17,17.55,14,10,0.933,0.01874,1.354,1203.428
+VBS24154,33.37,32,25,34.97,34,38,34.2,33,34,22.33,19,17,0.938,0.01869,1.111,1135.009
+VBS24155,44.29,44,44,44.55,44,44,44.58,44,44,41.85,42,42,0.943,0.01797,1.027,1272.539
+VBS24156,30.89,30,34,32.44,32,34,31.77,31,33,19.88,17,16,0.939,0.01843,1.389,1342.29
+VBS24157,46.32,46,46,46.71,46,46,46.73,46,46,42.82,43,45,0.943,0.01786,1.009,1314.312
+VBS24159,32.25,31,22,33.91,34,39,33.1,32,38,20.78,18,18,0.937,0.01864,1.109,1067.512
+VBS24160,38.4,38,39,38.64,38,39,38.5,38,39,36.93,37,38,0.943,0.01784,1.044,1176.992
+VBS24161,38.85,39,44,41.0,42,45,40.08,40,44,23.41,21,21,0.941,0.01814,1.226,1337.379
+VBS24162,31.24,31,33,32.81,33,34,32.35,32,33,19.29,17,16,0.939,0.01862,1.027,992.652
+VBS24163,31.73,29,24,33.37,31,26,32.55,30,25,20.48,17,13,0.938,0.01805,1.198,1128.01
+VBS24164,41.28,41,41,41.62,41,41,41.32,41,41,39.5,39,40,0.941,0.01881,0.779,998.365
+VBS24165,40.39,40,41,42.17,41,41,42.14,41,41,24.71,21,20,0.941,0.01866,0.894,1101.912
+VBS24166,36.78,37,40,38.51,39,40,38.0,38,40,23.45,20,19,0.939,0.01838,1.217,1230.887
+VBS24167,29.41,29,28,29.64,29,28,29.66,29,28,27.33,27,27,0.939,0.01856,0.944,819.739
+VBS24168,43.57,43,42,43.91,43,42,43.89,43,42,40.66,40,41,0.943,0.01819,0.986,1327.565
+VBS24169,36.75,37,40,38.49,39,40,38.17,38,39,22.58,19,19,0.941,0.01791,1.064,1114.605
+VBS24170,39.45,39,46,41.41,42,47,40.77,41,45,24.69,22,22,0.939,0.01809,1.102,1323.763
+VBS24171,36.52,36,36,36.92,36,36,36.84,36,36,33.33,33,34,0.94,0.01834,0.987,1282.503
+VBS24172,33.55,34,36,35.23,35,36,34.8,34,36,20.28,18,17,0.938,0.01819,1.157,1189.183
+VBS24173,38.84,39,40,40.48,40,40,40.74,40,39,23.35,20,19,0.942,0.01818,0.974,1130.073
+VBS24174,40.17,38,36,40.12,38,37,40.45,38,37,39.3,36,35,0.944,0.01808,0.917,1077.389
+VBS24175,39.64,39,39,40.04,39,39,40.0,39,39,36.34,36,37,0.943,0.01813,1.043,1418.836
+VBS24176,41.22,40,39,41.15,40,39,41.41,40,39,40.86,38,37,0.944,0.01829,0.81,1067.654
+VBS24177,33.43,32,32,33.76,33,32,33.46,32,32,31.69,31,31,0.94,0.01854,1.22,1254.833
+VBS24178,34.77,35,40,36.64,37,40,35.87,36,40,21.21,19,19,0.938,0.01858,1.171,1327.225
+VBS24179,37.53,34,17,39.49,37,19,38.33,35,15,24.89,21,16,0.938,0.01869,1.042,1164.221
+VBS24180,35.19,35,36,36.73,36,36,36.84,36,36,21.17,18,17,0.939,0.01866,1.075,1238.355
+VBS24181,39.11,38,36,39.27,38,37,39.34,38,37,37.46,36,35,0.943,0.01856,0.854,963.472
+VBS24182,44.43,44,44,45.02,44,44,44.71,44,44,40.53,41,42,0.942,0.01857,0.986,1439.162
+VBS24183,34.03,33,39,35.83,35,41,34.7,34,39,22.67,19,19,0.939,0.01842,1.171,1071.532
+VBS24184,46.01,45,45,46.29,46,45,46.4,45,45,43.12,43,43,0.944,0.01814,0.901,1228.93
+VBS24185,38.91,37,36,38.92,37,36,39.12,37,36,38.0,36,34,0.941,0.01867,0.926,1046.591
+VBS24186,41.1,40,40,41.36,41,40,41.21,40,40,39.5,39,39,0.945,0.01828,1.228,2021.875
+VBS24187,41.27,39,38,41.24,39,38,41.25,39,38,41.5,38,36,0.942,0.01894,0.846,1092.727
+VBS24188,41.8,42,42,42.27,42,42,41.91,42,42,39.09,40,41,0.942,0.01875,0.99,1551.157
+VBS24189,36.86,37,38,37.12,37,38,36.77,36,37,36.0,36,37,0.942,0.01804,1.313,1731.755
+VBS24190,31.09,30,30,31.15,30,30,31.4,30,30,29.63,29,29,0.942,0.01808,1.06,1030.157
+VBS24191,49.79,49,49,50.29,49,49,50.2,49,49,45.78,46,47,0.944,0.01838,0.887,1200.903
+VBS24192,42.5,43,46,43.03,44,46,42.1,43,46,41.55,42,46,0.94,0.01863,0.992,1297.628
+VBS24193,43.54,43,43,44.05,43,43,43.74,43,43,40.36,40,42,0.941,0.0188,0.922,1452.218
+VBS24194,40.89,41,41,41.25,41,41,41.01,41,41,38.72,39,40,0.942,0.01796,0.991,1143.829
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/samples.meta.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/samples.meta.csv
new file mode 100644
index 000000000..593e00de7
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/samples.meta.csv
@@ -0,0 +1,82 @@
+sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call
+VBS17631,1232_KEN_D0268A_A01,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17632,1232_KEN_D0268A_B01,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17633,1232_KEN_D0268A_C01,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17634,1232_KEN_D0268A_D01,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17635,1232_KEN_D0268A_E01,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17636,1232_KEN_D0268A_F01,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17637,1232_KEN_D0268A_G01,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17638,1232_KEN_D0268A_H01,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17641,1232_KEN_D0268A_C02,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17642,1232_KEN_D0268A_D02,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,M
+VBS17643,1232_KEN_D0268A_E02,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17645,1232_KEN_D0268A_G02,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17646,1232_KEN_D0268A_H02,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17647,1232_KEN_D0268A_A03,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17648,1232_KEN_D0268A_B03,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17649,1232_KEN_D0268A_C03,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17650,1232_KEN_D0268A_D03,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17651,1232_KEN_D0268A_E03,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17652,1232_KEN_D0268A_F03,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17653,1232_KEN_D0268A_G03,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17654,1232_KEN_D0268A_H03,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17657,1232_KEN_D0268A_C04,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17658,1232_KEN_D0268A_D04,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17659,1232_KEN_D0268A_E04,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17660,1232_KEN_D0268A_F04,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17661,1232_KEN_D0268A_G04,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17663,1232_KEN_D0268A_A05,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17664,1232_KEN_D0268A_B05,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17665,1232_KEN_D0268A_C05,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17666,1232_KEN_D0268A_D05,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17668,1232_KEN_D0268A_F05,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17669,1232_KEN_D0268A_G05,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17672,1232_KEN_D0268A_B06,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17673,1232_KEN_D0268A_C06,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17674,1232_KEN_D0268A_D06,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17676,1232_KEN_D0268A_F06,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,M
+VBS17677,1232_KEN_D0268A_G06,Eric Ochomo,Kenya,Ahero,2014,6,-0.174,34.920,F
+VBS17678,1232_KEN_D0268A_H06,Eric Ochomo,Kenya,Uriri,2016,10,-0.952,34.513,F
+VBS17679,1232_KEN_D0268A_A07,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17680,1232_KEN_D0268A_B07,Eric Ochomo,Kenya,Uriri,2016,10,-0.952,34.513,F
+VBS17681,1232_KEN_D0268A_C07,Eric Ochomo,Kenya,Uriri,2016,10,-0.952,34.513,F
+VBS17683,1232_KEN_D0268A_E07,Eric Ochomo,Kenya,Awendo,2016,10,-0.901,34.533,F
+VBS17684,1232_KEN_D0268A_F07,Eric Ochomo,Kenya,Awendo,2016,10,-0.901,34.533,F
+VBS17685,1232_KEN_D0268A_G07,Eric Ochomo,Kenya,Awendo,2016,10,-0.901,34.533,F
+VBS17686,1232_KEN_D0268A_H07,Eric Ochomo,Kenya,Sumba,2016,10,-0.686,34.596,F
+VBS17687,1232_KEN_D0268A_A08,Eric Ochomo,Kenya,Sumba,2016,10,-0.686,34.596,F
+VBS17689,1232_KEN_D0268A_C08,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17690,1232_KEN_D0268A_D08,Eric Ochomo,Kenya,Sumba,2016,10,-0.686,34.596,F
+VBS17691,1232_KEN_D0268A_E08,Eric Ochomo,Kenya,Sumba,2016,10,-0.686,34.596,F
+VBS17692,1232_KEN_D0268A_F08,Eric Ochomo,Kenya,Sumba,2016,10,-0.686,34.596,F
+VBS17693,1232_KEN_D0268A_G08,Eric Ochomo,Kenya,Sumba,2016,10,-0.686,34.596,F
+VBS17694,1232_KEN_D0268A_H08,Eric Ochomo,Kenya,Sumba,2016,10,-0.686,34.596,F
+VBS17695,1232_KEN_D0268A_A09,Eric Ochomo,Kenya,Sumba,2016,10,-0.686,34.596,F
+VBS17696,1232_KEN_D0268A_B09,Eric Ochomo,Kenya,Sumba,2016,10,-0.686,34.596,F
+VBS17697,1232_KEN_D0268A_C09,Eric Ochomo,Kenya,Uriri,2016,10,-0.952,34.513,F
+VBS17698,1232_KEN_D0268A_D09,Eric Ochomo,Kenya,Uriri,2016,10,-0.952,34.513,F
+VBS17699,1232_KEN_D0268A_E09,Eric Ochomo,Kenya,Awendo,2016,10,-0.901,34.533,F
+VBS17700,1232_KEN_D0268A_F09,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17701,1232_KEN_D0268A_G09,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17702,1232_KEN_D0268A_H09,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17703,1232_KEN_D0268A_A10,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17704,1232_KEN_D0268A_B10,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17705,1232_KEN_D0268A_C10,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17706,1232_KEN_D0268A_D10,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17707,1232_KEN_D0268A_E10,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17709,1232_KEN_D0268A_G10,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17710,1232_KEN_D0268A_H10,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17711,1232_KEN_D0268A_A11,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17714,1232_KEN_D0268A_D11,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17715,1232_KEN_D0268A_E11,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17716,1232_KEN_D0268A_F11,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17717,1232_KEN_D0268A_G11,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17718,1232_KEN_D0268A_H11,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17719,1232_KEN_D0268A_A12,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17720,1232_KEN_D0268A_B12,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17721,1232_KEN_D0268A_C12,Eric Ochomo,Kenya,Uradi,2016,10,0.980,34.543,F
+VBS17722,1232_KEN_D0268A_D12,Eric Ochomo,Kenya,Nyamilu,2016,10,-1.074,34.391,F
+VBS17723,1232_KEN_D0268A_E12,Eric Ochomo,Kenya,Awendo,2016,10,-0.901,34.533,F
+VBS17724,1232_KEN_D0268A_F12,Eric Ochomo,Kenya,Awendo,2016,10,-0.901,34.533,F
+VBS17725,1232_KEN_D0269A_A01,Eric Ochomo,Kenya,Awendo,2016,10,-0.901,34.533,F
+VBS17726,1232_KEN_D0269A_B01,Eric Ochomo,Kenya,Awendo,2016,10,-0.901,34.533,F
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/surveillance.flags.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/surveillance.flags.csv
new file mode 100644
index 000000000..08046546e
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/surveillance.flags.csv
@@ -0,0 +1,99 @@
+sample_id,is_surveillance
+VBS17631,True
+VBS17632,True
+VBS17633,True
+VBS17634,True
+VBS17635,True
+VBS17636,True
+VBS17637,True
+VBS17638,True
+VBS17639,True
+VBS17640,True
+VBS17641,True
+VBS17642,True
+VBS17643,True
+VBS17644,True
+VBS17645,True
+VBS17646,True
+VBS17647,True
+VBS17648,True
+VBS17649,True
+VBS17650,True
+VBS17651,True
+VBS17652,True
+VBS17653,True
+VBS17654,True
+VBS17655,True
+VBS17656,True
+VBS17657,True
+VBS17658,True
+VBS17659,True
+VBS17660,True
+VBS17661,True
+VBS17662,True
+VBS17663,True
+VBS17664,True
+VBS17665,True
+VBS17666,True
+VBS17667,True
+VBS17668,True
+VBS17669,True
+VBS17670,True
+VBS17671,True
+VBS17672,True
+VBS17673,True
+VBS17674,True
+VBS17675,True
+VBS17676,True
+VBS17677,True
+VBS17678,True
+VBS17679,True
+VBS17680,True
+VBS17681,True
+VBS17682,True
+VBS17683,True
+VBS17684,True
+VBS17685,True
+VBS17686,True
+VBS17687,True
+VBS17688,True
+VBS17689,True
+VBS17690,True
+VBS17691,True
+VBS17692,True
+VBS17693,True
+VBS17694,True
+VBS17695,True
+VBS17696,True
+VBS17697,True
+VBS17698,True
+VBS17699,True
+VBS17700,True
+VBS17701,True
+VBS17702,True
+VBS17703,True
+VBS17704,True
+VBS17705,True
+VBS17706,True
+VBS17707,True
+VBS17708,True
+VBS17709,True
+VBS17710,True
+VBS17711,True
+VBS17712,True
+VBS17713,True
+VBS17714,True
+VBS17715,True
+VBS17716,True
+VBS17717,True
+VBS17718,True
+VBS17719,True
+VBS17720,True
+VBS17721,True
+VBS17722,True
+VBS17723,True
+VBS17724,True
+VBS17725,True
+VBS17726,True
+VBS17727,True
+VBS17728,True
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/wgs_accession_data.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/wgs_accession_data.csv
new file mode 100644
index 000000000..2cf34b5b4
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/wgs_accession_data.csv
@@ -0,0 +1,82 @@
+sample_id,run_ena
+VBS17631,"ERR2703732, ERR2810207, ERR2810232"
+VBS17632,"ERR2703733, ERR2810208, ERR2810233"
+VBS17633,"ERR2703734, ERR2810209, ERR2810234"
+VBS17634,"ERR2703757, ERR2798807, ERR2798833"
+VBS17635,"ERR2703735, ERR2810210, ERR2810235"
+VBS17636,"ERR2703758, ERR2798808, ERR2798834"
+VBS17637,"ERR2703759, ERR2798809, ERR2798835"
+VBS17638,"ERR2703760, ERR2798810, ERR2798836"
+VBS17641,"ERR2703761, ERR2798811, ERR2798837"
+VBS17642,"ERR2703762, ERR2798812, ERR2798838"
+VBS17643,"ERR2703737, ERR2810212, ERR2810237"
+VBS17645,"ERR2703785, ERR2810259, ERR2810284"
+VBS17646,"ERR2703763, ERR2798813, ERR2798839"
+VBS17647,"ERR2703764, ERR2798814, ERR2798840"
+VBS17648,"ERR2703765, ERR2798815, ERR2798841"
+VBS17649,"ERR2703766, ERR2798816, ERR2798842"
+VBS17650,"ERR2703738, ERR2810213, ERR2810238"
+VBS17651,"ERR2703767, ERR2798817, ERR2798843"
+VBS17652,"ERR2703739, ERR2810214, ERR2810239"
+VBS17653,"ERR2703740, ERR2810215, ERR2810240"
+VBS17654,"ERR2703768, ERR2798818, ERR2798844"
+VBS17657,"ERR2703770, ERR2798820, ERR2798846"
+VBS17658,"ERR2703742, ERR2810217, ERR2810242"
+VBS17659,"ERR2703771, ERR2798821, ERR2798847"
+VBS17660,"ERR2703772, ERR2798822, ERR2798848"
+VBS17661,"ERR2703773, ERR2798823, ERR2798849"
+VBS17663,"ERR2703774, ERR2798824, ERR2798850"
+VBS17664,"ERR2703775, ERR2798825, ERR2798851"
+VBS17665,"ERR2703776, ERR2798826, ERR2798852"
+VBS17666,"ERR2703744, ERR2810219, ERR2810244"
+VBS17668,"ERR2703745, ERR2810220, ERR2810245"
+VBS17669,"ERR2703778, ERR2798828, ERR2798854"
+VBS17672,"ERR2703747, ERR2810222, ERR2810247"
+VBS17673,"ERR2703748, ERR2810223, ERR2810248"
+VBS17674,"ERR2703780, ERR2798830, ERR2798856"
+VBS17676,"ERR2703781, ERR2798831, ERR2798857"
+VBS17677,"ERR2703782, ERR2798832, ERR2798858"
+VBS17678,"ERR2703786, ERR2810260, ERR2810285"
+VBS17679,"ERR2703787, ERR2810261, ERR2810286"
+VBS17680,"ERR2659229, ERR2659249, ERR2659269"
+VBS17681,"ERR2659236, ERR2659256, ERR2659276"
+VBS17683,"ERR2659240, ERR2659260, ERR2659280"
+VBS17684,"ERR2659239, ERR2659259, ERR2659279"
+VBS17685,"ERR2703750, ERR2810225, ERR2810250"
+VBS17686,"ERR2659232, ERR2659252, ERR2659272"
+VBS17687,"ERR2659238, ERR2659258, ERR2659278"
+VBS17689,"ERR2659223, ERR2659243, ERR2659263"
+VBS17690,"ERR2659231, ERR2659251, ERR2659271"
+VBS17691,"ERR2659237, ERR2659257, ERR2659277"
+VBS17692,"ERR2659228, ERR2659248, ERR2659268"
+VBS17693,"ERR2659222, ERR2659242, ERR2659262"
+VBS17694,"ERR2659241, ERR2659261, ERR2659281"
+VBS17695,"ERR2659230, ERR2659250, ERR2659270"
+VBS17696,"ERR2703790, ERR2810264, ERR2810289"
+VBS17697,"ERR2703751, ERR2810226, ERR2810251"
+VBS17698,"ERR2703791, ERR2810265, ERR2810290"
+VBS17699,"ERR2703752, ERR2810227, ERR2810252"
+VBS17700,"ERR2703792, ERR2810266, ERR2810291"
+VBS17701,"ERR2703793, ERR2810267, ERR2810292"
+VBS17702,"ERR2703753, ERR2810228, ERR2810253"
+VBS17703,
+VBS17704,"ERR2703794, ERR2810268, ERR2810293"
+VBS17705,"ERR2703795, ERR2810269, ERR2810294"
+VBS17706,"ERR2703796, ERR2810270, ERR2810295"
+VBS17707,"ERR2703797, ERR2810271, ERR2810296"
+VBS17709,"ERR2703799, ERR2810273, ERR2810298"
+VBS17710,"ERR2703754, ERR2810229, ERR2810254"
+VBS17711,"ERR2703800, ERR2810274, ERR2810299"
+VBS17714,"ERR2703803, ERR2810277, ERR2810302"
+VBS17715,"ERR2659234, ERR2659254, ERR2659274"
+VBS17716,"ERR2703804, ERR2810278, ERR2810303"
+VBS17717,"ERR2659235, ERR2659255, ERR2659275"
+VBS17718,"ERR2659226, ERR2659246, ERR2659266"
+VBS17719,"ERR2659233, ERR2659253, ERR2659273"
+VBS17720,"ERR2703755, ERR2810230, ERR2810255"
+VBS17721,"ERR2659225, ERR2659245, ERR2659265"
+VBS17722,"ERR2659227, ERR2659247, ERR2659267"
+VBS17723,"ERR2703805, ERR2810279, ERR2810304"
+VBS17724,
+VBS17725,"ERR2703756, ERR2810231, ERR2810256"
+VBS17726,"ERR2659224, ERR2659244, ERR2659264"
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/wgs_snp_data.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/wgs_snp_data.csv
new file mode 100644
index 000000000..5c3b975f7
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1232-VO-KE-OCHOMO-VMF00044/wgs_snp_data.csv
@@ -0,0 +1,82 @@
+sample_id,alignments_bam,alignments_bam_md5,snp_genotypes_vcf,snp_genotypes_vcf_md5,snp_genotypes_zarr,snp_genotypes_zarr_md5,pipeline_version
+VBS17631,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17631.fixmate.bam,52189137f2c54d586a99f297a5bf5d16,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17631.vcf.gz,e7c92e916e2bd2fe34ae3ecd0c0ae3e5,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17631.zarr.zip,ac96d8bd07e608df581e6f22fe0b996f,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17632,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17632.fixmate.bam,37c4d678dd2f671aa48e77e9377cdc9c,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17632.vcf.gz,e490def719f07acb609d6a0bcab7e43c,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17632.zarr.zip,d44a104907f490b55ff9023d40d27701,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17633,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17633.fixmate.bam,68a46d57d416b1b5ac5c6cb51777c617,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17633.vcf.gz,425a7a2520f1538404b05eeae6ee6c05,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17633.zarr.zip,038debae76b6e06961122b28e484eddf,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17634,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17634.fixmate.bam,e2d0cca96fb37ec3f9089688b04cc642,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17634.vcf.gz,b0c0d8f801e4904b741daf9aab9a6fbe,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17634.zarr.zip,91f4c6c5d5913669046a02f3a9419386,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17635,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17635.fixmate.bam,5eb96643601bc33ee897ab5386ea229e,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17635.vcf.gz,e8ec946e55e1c2484327640c4d65635f,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17635.zarr.zip,616659258f3c6b54d0d28e2a770a83ad,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17636,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17636.fixmate.bam,ee6c30798ceb81852141dfd04e928372,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17636.vcf.gz,e9a268104cb57865f2dfaf1fd46993f2,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17636.zarr.zip,f43751e39b9deba625dc89887bf3d553,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17637,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17637.fixmate.bam,750d7f9930941df015b5a9296bf5c194,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17637.vcf.gz,a7cdfa255527e30709b67514078bfbf4,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17637.zarr.zip,00d9be56e74b492e52c4961dcc1d17c7,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17638,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17638.fixmate.bam,36635f852608d5d235ee0fd5bbfe52bd,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17638.vcf.gz,cf5a2cf4dc88ad691a491d2e50f181a6,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17638.zarr.zip,5151064ad2f28c3ddd0a68f5ed87e2f7,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17641,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17641.fixmate.bam,414dbc6f55a5f6afdef217c9b06aca13,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17641.vcf.gz,9ab70f8e41c2e6e3f30b583da49752d2,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17641.zarr.zip,bd00ab973ee52bf2329b3029e1a5bc91,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17642,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17642.fixmate.bam,e827c12a5f22102be6a34a9757b0d27f,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17642.vcf.gz,857c2d1887e5c433dbf24533891c3076,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17642.zarr.zip,dd5c464fd81efeba8276116257a58151,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17643,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17643.fixmate.bam,1bcdd86862152e4876ca6563a12ad09e,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17643.vcf.gz,30f3a95a619d93351336945fd352c92e,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17643.zarr.zip,c7dabad0acef2a583f743afca192e446,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17645,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17645.fixmate.bam,14547cf00ce31790d8376472cddc168c,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17645.vcf.gz,d5d33f8c0a74590e71fe1557664c0fee,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17645.zarr.zip,6de30a91ca0f3efb8e970329d3e146e0,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17646,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17646.fixmate.bam,5c609e1f6a6ac92798872bf0cf05c41a,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17646.vcf.gz,bb72cc781e5bf43968f77a6c48902c74,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17646.zarr.zip,85179b8c746a7f72fc6ab65ede0831b0,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17647,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17647.fixmate.bam,04d10e034bf6dfeefb1703346258ecf7,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17647.vcf.gz,3096f69bb1b48adda1093bce42e28326,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17647.zarr.zip,3ba7fa22abcdd9b3eb31748531a23379,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17648,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17648.fixmate.bam,77e4e200bb07bfd7e0e9b09c40317595,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17648.vcf.gz,88490e47314fbdd4565e513b1445c0d3,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17648.zarr.zip,9deabf4139afb94c366e45d941d44cff,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17649,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17649.fixmate.bam,15f8a44ea5a1bf58745738a1bfeeab91,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17649.vcf.gz,950e13951fda8103e1b95f2f336f1287,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17649.zarr.zip,f70e82c0e9bf4c0f89aa65d78018b1b3,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17650,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17650.fixmate.bam,85fc1f8a3492b42f112803cea7b635d4,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17650.vcf.gz,1f199a5f7dfdf32e3da3226f22e0a425,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17650.zarr.zip,4252727fdf1f61ada67da04e3b268290,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17651,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17651.fixmate.bam,282a226fa617460fe1528d85627092ea,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17651.vcf.gz,574d64fae9cc72134842da0c2ae414d3,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17651.zarr.zip,20c2dccc0f1299b6cdc9ea96085fee49,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17652,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17652.fixmate.bam,bf8ef04a64df8c70486b137d5a80d26e,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17652.vcf.gz,45c173afe21ce8f997c5da51ec3bc33f,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17652.zarr.zip,bd655b278ff5a0ad3c69ec723909bbdb,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17653,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17653.fixmate.bam,ffba817cf0ec9659be7f89280c6e01a1,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17653.vcf.gz,3e4177f2db7843d9d9058b9558cf62a4,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17653.zarr.zip,b4a441b7ae279ad913199607a983d075,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17654,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17654.fixmate.bam,506ea2650b25d481b784c3c81ba8982b,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17654.vcf.gz,c8057da801dbf2a7e72c24457e303bae,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17654.zarr.zip,8dd701f9bb49759460f4b414906c6e3e,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17657,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17657.fixmate.bam,3b177bac830b81a34cf715a3e4b4fda6,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17657.vcf.gz,513f4d55e2b765776d72dab8480848a1,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17657.zarr.zip,dceb98a570d9a42925a0cba9fe97590c,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17658,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17658.fixmate.bam,55e7a7da598746cb41c934ecd7d0d6e1,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17658.vcf.gz,39ebc3d827cc94814b2af237c02f598a,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17658.zarr.zip,0d08edd845eb1ac23a63273b3dffccab,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17659,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17659.fixmate.bam,a9557efdf1f215595707a609b1837aa7,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17659.vcf.gz,65d4acb4bcb397ed36a93911ec9a87d7,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17659.zarr.zip,6abb24595d357defc6b1b3ec5a0ae80e,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17660,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17660.fixmate.bam,1f8650a372829941c12bb403124246fe,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17660.vcf.gz,00e870dc71b54109a98f7b4c194b5e97,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17660.zarr.zip,0f9ba9bcea0778c50d163d547a724ff9,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17661,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17661.fixmate.bam,4cdd9b15a9fe3eedcda70a9cf9c2f067,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17661.vcf.gz,f775ea3995135600b107fbb62055a8b0,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17661.zarr.zip,0e454220cf8978e7964adb9ec598b0bb,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17663,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17663.fixmate.bam,d14e46424d40ff85ccbc55c761248672,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17663.vcf.gz,e1565d5261ffb4b852df6fe3343d8da6,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17663.zarr.zip,09a688f80bb774e25a9fa0672897414d,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17664,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17664.fixmate.bam,133f4c5e502d6954f0426a8d7fc432a5,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17664.vcf.gz,f0d87579fe164a3ea0f11d709039174e,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17664.zarr.zip,3df8c8d7d2bff61f580356ecbbdd6a9e,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17665,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17665.fixmate.bam,2a726b7a4c6de2b4420f0170f7eb721b,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17665.vcf.gz,9aade66062f4bf5c53baf62f0f8f82fe,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17665.zarr.zip,2578e975e3e4a2338160fddeb72e3d16,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17666,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17666.fixmate.bam,e1a0f1d6caf86b7444d48218ab045e0a,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17666.vcf.gz,30d3d7d43268129ccd8ee46b724863b7,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17666.zarr.zip,cc4c0a78b6ed528999272c05acf5be10,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17668,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17668.fixmate.bam,16cf96edafb1e099e938dc9fac8a28e6,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17668.vcf.gz,74bf4743cee31620b4c54a26250e741a,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17668.zarr.zip,232bda5d1197ae10e4e7318e944c656d,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17669,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17669.fixmate.bam,2d7b5706dca9cd281bdce7b72f76ff7b,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17669.vcf.gz,6242e8fd8c336f283b360fe39e1a51f8,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17669.zarr.zip,f52ebb9c645db4627a1d40a8678599fc,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17672,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17672.fixmate.bam,80b6896089e21f8af9387b9cfae42856,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17672.vcf.gz,34a4937513d6f1c5f347dc82276e456d,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17672.zarr.zip,58b454fcd9a83d14a4148ad7b73a24c4,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17673,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17673.fixmate.bam,6087a24a761aa2766d8759bf980e908b,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17673.vcf.gz,79604a5cc4754abf36800f07b6a93058,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17673.zarr.zip,43e5551159bac39540afb03e66fad547,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17674,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17674.fixmate.bam,e8201a2658114e8b77b26c9756908dce,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17674.vcf.gz,e711d34b1a12c20d4ee29ce51aba962b,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17674.zarr.zip,206d8066a365447163cf82b2b0dedc5f,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17676,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17676.fixmate.bam,7d12547a8a43616c9f210a8b935d1a5f,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17676.vcf.gz,cc917417f8b56844d644784def9d68de,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17676.zarr.zip,da562a5ad1c841574276f746f76dc010,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17677,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17677.fixmate.bam,380245c46a69ed07b11398b18d589b4a,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17677.vcf.gz,1efa13bda3fa0988e1d7d57d2fa67122,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17677.zarr.zip,f7485977ae4d5a74fe4f35634f48eaa8,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17678,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17678.fixmate.bam,0ec6479e30e874e66577a6f0aaf1d2d8,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17678.vcf.gz,c84beb6954d7c3fe3cac41e3c3f66b56,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17678.zarr.zip,3c4e2b3f82da9e7d20c97d41fb304cc9,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17679,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17679.fixmate.bam,f06bab5e7278a01c81e92474b16c5669,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17679.vcf.gz,2a8a563db69ec4f57b18f09e0c3f1e8b,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17679.zarr.zip,b4a9232541e295c845eb95237b22c32b,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17680,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17680.fixmate.bam,9fcf174431eaf75075a4e5e3c11a1f91,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17680.vcf.gz,1e94d7fe9e58974fc0e409785743601d,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17680.zarr.zip,0b0c3b2edb4bdadfd2e4a0259758723d,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17681,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17681.fixmate.bam,8c8366162259c1f660c2261f7e296ce6,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17681.vcf.gz,ed6eb79844398867a2e6f97a17ca0dba,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17681.zarr.zip,66399f5fd7f1c876e0603e668d359922,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17683,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17683.fixmate.bam,c31ddb80b10fc99a5e10b8fe6cb1c7e4,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17683.vcf.gz,bf9465bfe6771dd871b3419ca491ec0f,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17683.zarr.zip,536181a8fe8ac84ea73fa6c71fd2c07d,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17684,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17684.fixmate.bam,71c69533b5eb81f2cd380afd7e47001f,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17684.vcf.gz,a7a65d6abe2493d625190fbec53068f0,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17684.zarr.zip,56856d07d8ed93a453415f74bc914ab1,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17685,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17685.fixmate.bam,347a4cbdb98f7bf139e55b047d90cdc1,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17685.vcf.gz,c81eaae33d3ecc92a074db92d879a71c,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17685.zarr.zip,1c8e3ada405adfaafff492eec303c78a,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17686,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17686.fixmate.bam,16f50de48d42cd7623bfc078c0427382,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17686.vcf.gz,4390513376769015bd738e92d6f1b687,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17686.zarr.zip,91724c7ea6808b99162185b649d71817,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17687,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17687.fixmate.bam,57f9c95ed69913033b23319bc42cfbfa,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17687.vcf.gz,df6b0f70829c1294e3382cce549dd937,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17687.zarr.zip,d6c5205a74e455d46c5a9bbde0450543,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17689,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17689.fixmate.bam,7a35e7059b49a07b7b89244ff8931f17,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17689.vcf.gz,2253b89d0418e5948c3bdb2a80e45d6d,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17689.zarr.zip,b874ecbdca52e07cd3a81d022d0e16ec,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17690,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17690.fixmate.bam,9242ae7f87af953a24c50c4469e5f684,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17690.vcf.gz,5c951100dd8067ad63f80ada03a3277d,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17690.zarr.zip,a829ee1899215b516331bde234981924,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17691,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17691.fixmate.bam,73c9c2661072e6257de744a8516fa099,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17691.vcf.gz,937b9aeba27a5f2687e1ca04e5ac449b,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17691.zarr.zip,95b94a5a12b71e1a6446e4d3ff5b03df,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17692,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17692.fixmate.bam,f15c1f79a867a8afd1493b6ad005acc2,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17692.vcf.gz,0378626ad29c99662b9eb6a03785f697,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17692.zarr.zip,e1f5edc5e5d680056fe2509254c5701f,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17693,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17693.fixmate.bam,ceff2ccd94903bd3995db0269a50e034,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17693.vcf.gz,86dd444cedaeff003dd743f240f44ddf,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17693.zarr.zip,a3b6981882f2959be192d8716549c0fb,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17694,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17694.fixmate.bam,84574d343a0ea1a34e6ba88954d21ad8,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17694.vcf.gz,ea38891eb28f8191320f5238aa275536,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17694.zarr.zip,ea4c311735b7c2732a32f4fb5677b7d8,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17695,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17695.fixmate.bam,bac0bb1f865973dbcc77a9df13fb01ee,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17695.vcf.gz,49b07956957651c351bf487c9288de6c,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17695.zarr.zip,e8e3ee66fa3d03535dd2050426e5653a,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17696,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17696.fixmate.bam,692268ed6ed811445a962f1295d2c7ff,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17696.vcf.gz,dc1ce02590980a7bac426ecc9f88a56d,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17696.zarr.zip,3728f0a93a99f526ccbd0a22dbf4d445,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17697,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17697.fixmate.bam,ffd062582866bb10ea26f09b8a49b841,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17697.vcf.gz,f0a715c5f420c479664f94f57ea6aaab,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17697.zarr.zip,59087a34e9a240902d65c979b3a43970,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17698,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17698.fixmate.bam,677c52a6335c838aa0af637dcd330d18,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17698.vcf.gz,098c12526adeec599a2cf805b380dd42,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17698.zarr.zip,6a05d391fd0ce7b17b11ac76a799dfe0,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17699,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17699.fixmate.bam,33675e8e921ac4dee359e8b6754757f5,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17699.vcf.gz,d1cc1b8ea1f4c5fb8b68c828053e0ed4,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17699.zarr.zip,8d9fdad87d9cb3a8f16a6ccec267ec05,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17700,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17700.fixmate.bam,b819b5021518fe379c3278d135925387,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17700.vcf.gz,34e15c35230f572c3e75e508a3de2cc0,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17700.zarr.zip,b57fc8c359229229421b9601179ba066,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17701,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17701.fixmate.bam,f76928fd8ca3eece87d1fb23821e01f4,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17701.vcf.gz,b1af2c6cffc66a254dc111d7fc1a4871,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17701.zarr.zip,69e5ec7b765a9b80f027e2cef4795cf2,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17702,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17702.fixmate.bam,9044fb57de93faa23307df7087f21ad0,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17702.vcf.gz,6aa9f1e3088ef4919eae61ff68401c80,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17702.zarr.zip,15f692b92a459ca69962ff6239e8fe5f,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17703,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17703.fixmate.bam,b34672fe081ee4325a2c89aa504f8f64,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17703.vcf.gz,63bb4d69b38e1999b0231b9405f30052,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17703.zarr.zip,0af258d4e987f9b9b49a078fa843f0e8,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17704,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17704.fixmate.bam,4180372bca2ca9147eefee33175681c5,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17704.vcf.gz,3ea9aaf538032de218cf250cf4d0eccc,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17704.zarr.zip,43a1549a732a1f1e081ed663ade73e8e,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17705,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17705.fixmate.bam,5a3d466924a045cc35b0d08353f53d90,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17705.vcf.gz,3955ecd76cdfc432796f9515abcf295a,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17705.zarr.zip,c3e92d44f3a76923611c63016c04c2d5,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17706,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17706.fixmate.bam,e9686f20b1d115c566b8e488d67a577f,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17706.vcf.gz,457c7fb323f7681f87374fa55be60a53,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17706.zarr.zip,c2ed434787f2c34f2afaa4e67c1cbaf7,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17707,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17707.fixmate.bam,91be34cd995255dffec047ecb2fac164,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17707.vcf.gz,7711b4485dbb0b359a56ad714a9dccb3,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17707.zarr.zip,134fdc46fc8fea48051ced9b31a86a09,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17709,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17709.fixmate.bam,ac7dbebb9c3e223f9c145fb709b7e7f7,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17709.vcf.gz,ff501a8eabf7d50ae1f8fd9191f6d23f,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17709.zarr.zip,6cbb0e03cfd52f2f331ae14933f7a3c1,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17710,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17710.fixmate.bam,30a916a0f3265704af1edb575090a10e,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17710.vcf.gz,0dc86848f3522468f9e97786add37486,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17710.zarr.zip,e2b5df151060cb9dfc4eafe2f4db1880,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17711,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17711.fixmate.bam,e973d2232c56c18ac5b30a553d92c6a0,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17711.vcf.gz,febfb66d91885edbaed15efc07e78973,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17711.zarr.zip,dad33d3871fec53158a15b70b09a5be0,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17714,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17714.fixmate.bam,25fa78a97c707313bdfda8abfc6553dd,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17714.vcf.gz,9bc4730116d21b38f0f102d6571c900f,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17714.zarr.zip,b59f7419d1b44c818df4280b0d1d8cbf,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17715,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17715.fixmate.bam,aa0f15925cc7a7b594a0bbd7a59eaad1,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17715.vcf.gz,1165193a112a98e024a160309c183666,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17715.zarr.zip,3fbf8a5344a74b4b975c7ebd4cba3d55,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17716,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17716.fixmate.bam,23e1580132ca9fe32d60e9501cd8e2fb,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17716.vcf.gz,abaa4352bba17afd47e69e72acaaa569,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17716.zarr.zip,49eb8aa66f34a676cdde77e33f116e44,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17717,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17717.fixmate.bam,2410f9a17f221e8c6b92ec02d406f0c8,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17717.vcf.gz,1f877d0138e586765eeab59f2208f31e,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17717.zarr.zip,068d9ca3b734782828fafeaadccf00c3,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17718,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17718.fixmate.bam,7724fdf638a2251eb2e1880e0755f1f8,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17718.vcf.gz,0c049fe0cf2ec1d4a08ba5c38514da25,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17718.zarr.zip,46e8c3ac5b88dbb2a873cf1a3f464298,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17719,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17719.fixmate.bam,44d260a4cd335f3ec398ca31b71e2890,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17719.vcf.gz,8e6a70f94e212e2e61ab04083cc980e9,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17719.zarr.zip,f662d50d071e97b8253df8c399fced0a,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17720,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17720.fixmate.bam,eee1d804a6a1428c395930f25a3d0e46,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17720.vcf.gz,bfd2deb7db4374fa4262648544301396,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17720.zarr.zip,81efcb8613c11a7d128a02e366538ab9,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17721,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17721.fixmate.bam,1f9b0b2d7577bf5f43eb20b8f402900a,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17721.vcf.gz,2c3eb40f0e6d8d17e3dbabdf46ff5456,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17721.zarr.zip,1702c3d8e20119ad4fcd7de2daaadbaa,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17722,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17722.fixmate.bam,84f8845eec3deaf2178f73c34cf3a90e,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17722.vcf.gz,9a0e8509c194ab96c0be528762bcae7d,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17722.zarr.zip,420a6005f4b5919866f95e72209fe15c,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17723,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17723.fixmate.bam,acf7859300336b3c10c5370d8219a98f,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17723.vcf.gz,cbb670203c7db60dab96d514b7289356,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17723.zarr.zip,91133dd1dcbe053ec10dbb7fb2023f99,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17724,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17724.fixmate.bam,d840b3457c49bc53b5ec87969718fe00,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17724.vcf.gz,fe52f45c8af39c21eb7dab682ffd08d5,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17724.zarr.zip,942497304b34f25ded08d63f4e7b4c2c,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17725,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17725.fixmate.bam,63923f6c156b0aa5436ac31c5fd14e23,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17725.vcf.gz,841241a709d5c1dd857805c949cdca32,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17725.zarr.zip,ce5cf5ca3fbd929d54da117c8b9f6c83,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS17726,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17726.fixmate.bam,6c1922aa144e6a1a2c5772c8e5cfb0b0,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17726.vcf.gz,8da8942cd8f18603ac67fae07e675471,https://1232-vo-ke-ochomo-vmf00044.cog.sanger.ac.uk/VBS17726.zarr.zip,5dbfc13fb970a188c80b48abcc305713,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/samples.meta.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/samples.meta.csv
new file mode 100644
index 000000000..cd2fd699d
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/samples.meta.csv
@@ -0,0 +1,77 @@
+sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call
+VBS24095,1235-MZ-A-1600865.6,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24096,1235-MZ-A-1600477.1,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24097,1235-MZ-A-1600854.0,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24098,1235-MZ-A-1600863.2,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24101,1235-MZ-A-1601048.2,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24104,1235-MZ-A-1503825.8,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24105,1235-MZ-A-1600934.9,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24117,1235-MZ-A-1600856.4,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24118,1235-MZ-A-1600855.7,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24119,1235-MZ-A-1600853.3,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24120,1235-MZ-A-1600852.6,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24121,1235-MZ-A-1600851.9,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24122,1235-MZ-A-1504509.6,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24123,1235-MZ-A-1504510.2,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24124,1235-MZ-A-1600620.1,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24125,1235-MZ-A-1600619.5,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24126,1235-MZ-A-1600618.8,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24127,1235-MZ-A-1600850.2,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24128,1235-MZ-A-1600617.1,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24129,1235-MZ-A-1600869.4,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24130,1235-MZ-A-1600849.6,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24132,1235-MZ-A-1600847.2,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24135,1235-MZ-A-1504511.9,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24137,1235-MZ-A-1600842.7,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24138,1235-MZ-A-1600841.0,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24141,1235-MZ-A-1600838.0,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24142,1235-MZ-A-1600837.3,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24143,1235-MZ-A-1600833.5,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24144,1235-MZ-A-1600831.1,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24145,1235-MZ-A-1600814.4,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24147,1235-MZ-A-1600836.6,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24148,1235-MZ-A-1600835.9,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24149,1235-MZ-A-1600816.8,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24150,1235-MZ-A-1600830.4,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24152,1235-MZ-A-1504513.3,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24153,1235-MZ-A-1600829.8,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24154,1235-MZ-A-1600813.7,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24155,1235-MZ-A-1504289.7,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24156,1235-MZ-A-1600868.7,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24157,1235-MZ-A-1504516.4,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24159,1235-MZ-A-1600812.0,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24160,1235-MZ-A-1504518.8,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24161,1235-MZ-A-1600826.7,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24162,1235-MZ-A-1600822.9,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24163,1235-MZ-A-1600821.2,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24164,1235-MZ-A-1503791.6,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24165,1235-MZ-A-1600808.3,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24166,1235-MZ-A-1600809.0,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24167,1235-MZ-A-1504641.3,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24168,1235-MZ-A-1503788.6,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24169,1235-MZ-A-1600819.9,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24170,1235-MZ-A-1600818.2,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24171,1235-MZ-A-1504642.0,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24172,1235-MZ-A-1600897.7,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24173,1235-MZ-A-1600806.9,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24174,1235-MZ-A-1503867.8,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24175,1235-MZ-A-1503826.5,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24176,1235-MZ-A-1504301.6,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24177,1235-MZ-A-1503308.6,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24178,1235-MZ-A-1600803.8,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24179,1235-MZ-A-1600802.1,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,F
+VBS24180,1235-MZ-A-1600801.4,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24181,1235-MZ-A-1504507.2,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24182,1235-MZ-A-1503935.4,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24183,1235-MZ-A-1600823.6,Krijn Paaijmans,Mozambique,Palmeiras,2018,1,-25.265,32.877,M
+VBS24184,1235-MZ-A-1503789.3,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24185,1235-MZ-A-1503868.5,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24186,1235-MZ-A-1503286.7,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24187,1235-MZ-A-1503309.3,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24188,1235-MZ-A-1503936.1,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24189,1235-MZ-A-1503869.2,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24190,1235-MZ-A-1503345.1,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24191,1235-MZ-A-1503346.8,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24192,1235-MZ-A-1503305.5,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24193,1235-MZ-A-1503314.7,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
+VBS24194,1235-MZ-A-1504291.0,Krijn Paaijmans,Mozambique,Palmeiras,2018,2,-25.265,32.877,F
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/surveillance.flags.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/surveillance.flags.csv
new file mode 100644
index 000000000..043011f9b
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/surveillance.flags.csv
@@ -0,0 +1,80 @@
+sample_id,is_surveillance
+VBS24095,True
+VBS24096,True
+VBS24097,True
+VBS24098,True
+VBS24101,True
+VBS24104,True
+VBS24105,True
+VBS24117,True
+VBS24118,True
+VBS24119,True
+VBS24120,True
+VBS24121,True
+VBS24122,True
+VBS24123,True
+VBS24124,True
+VBS24125,True
+VBS24126,True
+VBS24127,True
+VBS24128,True
+VBS24129,True
+VBS24130,True
+VBS24132,True
+VBS24135,True
+VBS24137,True
+VBS24138,True
+VBS24141,True
+VBS24142,True
+VBS24143,True
+VBS24144,True
+VBS24145,True
+VBS24146,True
+VBS24147,True
+VBS24148,True
+VBS24149,True
+VBS24150,True
+VBS24151,True
+VBS24152,True
+VBS24153,True
+VBS24154,True
+VBS24155,True
+VBS24156,True
+VBS24157,True
+VBS24158,True
+VBS24159,True
+VBS24160,True
+VBS24161,True
+VBS24162,True
+VBS24163,True
+VBS24164,True
+VBS24165,True
+VBS24166,True
+VBS24167,True
+VBS24168,True
+VBS24169,True
+VBS24170,True
+VBS24171,True
+VBS24172,True
+VBS24173,True
+VBS24174,True
+VBS24175,True
+VBS24176,True
+VBS24177,True
+VBS24178,True
+VBS24179,True
+VBS24180,True
+VBS24181,True
+VBS24182,True
+VBS24183,True
+VBS24184,True
+VBS24185,True
+VBS24186,True
+VBS24187,True
+VBS24188,True
+VBS24189,True
+VBS24190,True
+VBS24191,True
+VBS24192,True
+VBS24193,True
+VBS24194,True
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/wgs_accession_data.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/wgs_accession_data.csv
new file mode 100644
index 000000000..51d73fd8e
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/wgs_accession_data.csv
@@ -0,0 +1,77 @@
+sample_id,run_ena
+VBS24095,ERR11871590
+VBS24096,ERR11871591
+VBS24097,ERR11871592
+VBS24098,ERR11871593
+VBS24101,ERR11871585
+VBS24104,ERR11871586
+VBS24105,"ERR11880064, ERR11871589"
+VBS24117,"ERR3766760, ERR3766791, ERR3766822"
+VBS24118,"ERR3795249, ERR3795279, ERR3795309"
+VBS24119,"ERR3766753, ERR3766784, ERR3766815"
+VBS24120,"ERR3766752, ERR3766783, ERR3766814"
+VBS24121,"ERR3766755, ERR3766786, ERR3766817"
+VBS24122,"ERR3795237, ERR3795267, ERR3795297"
+VBS24123,"ERR3766751, ERR3766782, ERR3766813"
+VBS24124,"ERR3795147, ERR3795176, ERR3795205"
+VBS24125,"ERR3795258, ERR3795288, ERR3795318"
+VBS24126,"ERR3795158, ERR3795187, ERR3795216"
+VBS24127,"ERR3795247, ERR3795277, ERR3795307"
+VBS24128,"ERR3795260, ERR3795290, ERR3795320"
+VBS24129,"ERR3766759, ERR3766790, ERR3766821"
+VBS24130,"ERR3795151, ERR3795180, ERR3795209"
+VBS24132,"ERR3795252, ERR3795282, ERR3795312"
+VBS24135,"ERR3795257, ERR3795287, ERR3795317"
+VBS24137,"ERR3795253, ERR3795283, ERR3795313"
+VBS24138,"ERR3795170, ERR3795199, ERR3795228"
+VBS24141,"ERR3795163, ERR3795192, ERR3795221"
+VBS24142,"ERR3766763, ERR3766794, ERR3766825"
+VBS24143,"ERR3795259, ERR3795289, ERR3795319"
+VBS24144,"ERR3795234, ERR3795264, ERR3795294"
+VBS24145,"ERR3795239, ERR3795269, ERR3795299"
+VBS24147,"ERR3795152, ERR3795181, ERR3795210"
+VBS24148,"ERR3795261, ERR3795291, ERR3795321"
+VBS24149,"ERR3795232, ERR3795262, ERR3795292"
+VBS24150,"ERR3795160, ERR3795189, ERR3795218"
+VBS24152,"ERR3795235, ERR3795265, ERR3795295"
+VBS24153,"ERR3795236, ERR3795266, ERR3795296"
+VBS24154,"ERR3795248, ERR3795278, ERR3795308"
+VBS24155,"ERR3795164, ERR3795193, ERR3795222"
+VBS24156,"ERR3795241, ERR3795271, ERR3795301"
+VBS24157,"ERR3766761, ERR3766792, ERR3766823"
+VBS24159,"ERR3795242, ERR3795272, ERR3795302"
+VBS24160,"ERR3795145, ERR3795174, ERR3795203"
+VBS24161,"ERR3795245, ERR3795275, ERR3795305"
+VBS24162,"ERR3795169, ERR3795198, ERR3795227"
+VBS24163,"ERR3795251, ERR3795281, ERR3795311"
+VBS24164,"ERR3766757, ERR3766788, ERR3766819"
+VBS24165,"ERR3795159, ERR3795188, ERR3795217"
+VBS24166,"ERR3795157, ERR3795186, ERR3795215"
+VBS24167,"ERR3795148, ERR3795177, ERR3795206"
+VBS24168,"ERR3795162, ERR3795191, ERR3795220"
+VBS24169,"ERR3795173, ERR3795202, ERR3795231"
+VBS24170,"ERR3795240, ERR3795270, ERR3795300"
+VBS24171,"ERR3795146, ERR3795175, ERR3795204"
+VBS24172,"ERR3795256, ERR3795286, ERR3795316"
+VBS24173,"ERR3795168, ERR3795197, ERR3795226"
+VBS24174,"ERR3795243, ERR3795273, ERR3795303"
+VBS24175,"ERR3795154, ERR3795183, ERR3795212"
+VBS24176,"ERR3795254, ERR3795284, ERR3795314"
+VBS24177,"ERR3795156, ERR3795185, ERR3795214"
+VBS24178,"ERR3795233, ERR3795263, ERR3795293"
+VBS24179,"ERR3795172, ERR3795201, ERR3795230"
+VBS24180,"ERR3795255, ERR3795285, ERR3795315"
+VBS24181,"ERR3795161, ERR3795190, ERR3795219"
+VBS24182,"ERR3795166, ERR3795195, ERR3795224"
+VBS24183,"ERR3795244, ERR3795274, ERR3795304"
+VBS24184,"ERR3795167, ERR3795196, ERR3795225"
+VBS24185,"ERR3795171, ERR3795200, ERR3795229"
+VBS24186,"ERR3795238, ERR3795268, ERR3795298"
+VBS24187,"ERR3795250, ERR3795280, ERR3795310"
+VBS24188,"ERR3795149, ERR3795178, ERR3795207"
+VBS24189,"ERR3766756, ERR3766787, ERR3766818"
+VBS24190,"ERR3795150, ERR3795179, ERR3795208"
+VBS24191,"ERR3766762, ERR3766793, ERR3766824"
+VBS24192,"ERR3795155, ERR3795184, ERR3795213"
+VBS24193,"ERR3795153, ERR3795182, ERR3795211"
+VBS24194,"ERR3766758, ERR3766789, ERR3766820"
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/wgs_snp_data.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/wgs_snp_data.csv
new file mode 100644
index 000000000..46dc3e52c
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1235-VO-MZ-PAAIJMANS-VMF00094/wgs_snp_data.csv
@@ -0,0 +1,77 @@
+sample_id,alignments_bam,alignments_bam_md5,snp_genotypes_vcf,snp_genotypes_vcf_md5,snp_genotypes_zarr,snp_genotypes_zarr_md5,pipeline_version
+VBS24095,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24095.fixmate.bam,43bfe1e01a923456e19df07a5a6cf17e,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24095.vcf.gz,d35fbd8d85e7af27ac0bb140dd6faed2,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24095.zarr.zip,7e4cb3d27f7ef1e195ffdd6c739d7dbc,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24096,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24096.fixmate.bam,ab0f2b6c01d6677aaa7884c2f3c3c3dd,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24096.vcf.gz,f9c7e0b5b72d2fa197bd3046a24e34cb,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24096.zarr.zip,71c0d291f64e0cc3fbb32027a5388dab,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24097,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24097.fixmate.bam,6a5275ecb90e7b8e157a74066f9f4357,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24097.vcf.gz,bd64b5a45ad30971dab9a0d28ef7c45b,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24097.zarr.zip,db4c4b166f7d17d56c6a7d5ea1a2d987,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24098,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24098.fixmate.bam,60e5ea6666423871d0e751cb7738018b,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24098.vcf.gz,3bce4e01ba97932f8fbbb94403ac4ddc,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24098.zarr.zip,b98ab2777cc9b4aecc112d10301ad5d5,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24101,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24101.fixmate.bam,dc86d97a68688e2f51d3f258691375a1,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24101.vcf.gz,4b9bac431701d4f18f2d163318daf0e2,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24101.zarr.zip,3cf24f548defd70ee9e72c621e9e6452,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24104,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24104.fixmate.bam,75b4dac7caa9e1969327c1ec8894ef00,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24104.vcf.gz,12898d5f12375183642649a56afadade,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24104.zarr.zip,6674d33422c955c72b96925451a27dc4,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24105,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24105.fixmate.bam,2696a903390d98ef0e4687f9abf8418b,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24105.vcf.gz,12fb5e9124054fde3b07a672c1bcd08d,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24105.zarr.zip,a9025fb594599929e00bb3f8a41b5282,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24117,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24117.fixmate.bam,4250c11d23d59abf2ca72aaf70a374dd,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24117.vcf.gz,120f56d015e4e51cc65c7bacee82f1de,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24117.zarr.zip,367d1a6fd4ca46ba12cac637e07cf88c,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24118,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24118.fixmate.bam,121db8535f5682b283493a387690328e,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24118.vcf.gz,a2293cd8be282d994f445e0d2643d427,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24118.zarr.zip,65f5904d954224e12d365382e9391779,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24119,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24119.fixmate.bam,ff1e8256a5932bbebcca7aa8222af6e4,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24119.vcf.gz,ef9ac67a0c1a372d0089c60985d86e31,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24119.zarr.zip,015996161c70cbf663569a619e3a88d7,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24120,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24120.fixmate.bam,5afe70f74873291a5f5f4759ba58bf43,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24120.vcf.gz,0d76ed60fc3fedf5d6a409fc3c60f042,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24120.zarr.zip,9d4284f120a0ae97eeae543b35d536c5,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24121,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24121.fixmate.bam,5a4389bc8b4c10bcf0689352b9347912,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24121.vcf.gz,de475e42e92ad787d7e62c56abf5d316,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24121.zarr.zip,af9d88a2354cc07e0b2847b8aedfe45f,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24122,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24122.fixmate.bam,72756fe04452dc6ffa2f7af04e31dde2,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24122.vcf.gz,3b4f67c3faa5e502e3cfc0909315275c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24122.zarr.zip,d9d75b05117b06e2f2c629dd7060bedb,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24123,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24123.fixmate.bam,10c71eca4bb47e048532640cd267f274,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24123.vcf.gz,927dc98eef7496fa237d151822ab372c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24123.zarr.zip,a826d037203380da19f6dd3dd0e39901,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24124,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24124.fixmate.bam,631983c823d7147af2d0fb7833f63b12,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24124.vcf.gz,8f4af5e09c5af0bc68f7671804c6ea38,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24124.zarr.zip,adc4813094468abd5ab973f54bdb2977,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24125,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24125.fixmate.bam,2b8a203283db92cd369b931b47081132,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24125.vcf.gz,9abfc1440a44d3ad45943b6817a777ab,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24125.zarr.zip,19ec608958cbeb35c20afe3130b0507a,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24126,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24126.fixmate.bam,87d6468a3dd2ef201716a37ad45b75b5,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24126.vcf.gz,e60e747e0ac50fb2a464024a2873e1f7,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24126.zarr.zip,306a5779fd65be4f6bfb87af6f67bbbd,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24127,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24127.fixmate.bam,ade20cf8b8f78882f6123d381c2d4385,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24127.vcf.gz,7fee90e8d856e61fc3a16143730b9571,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24127.zarr.zip,3db47130d3a3cfcb1b85362cbc7b7147,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24128,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24128.fixmate.bam,97e1d7c7bfe8906966fe9c95c6bd9cf6,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24128.vcf.gz,92bda2a9996bb65fed67d20239fbfb2c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24128.zarr.zip,0faac9c3522c7de8dceba1210016166c,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24129,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24129.fixmate.bam,041db92e055f8dcb2cc5ca5703f993a9,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24129.vcf.gz,8a7a050d8f1fc0d0da546fcc6ceedff8,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24129.zarr.zip,44a7836f663716079184d82e3957dbba,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24130,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24130.fixmate.bam,affe9bda5d8e651ef1a6e7b521eb1267,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24130.vcf.gz,1617e9c98d366200798f7bf5354f5d5f,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24130.zarr.zip,b66635b1622daf84d82ba1d0e558778b,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24132,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24132.fixmate.bam,63f1f9a06f80d569d1dcffac52f31d00,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24132.vcf.gz,e50d630185f761c00b149909f4a65f13,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24132.zarr.zip,80f94a24174eef6814c407852fb74196,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24135,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24135.fixmate.bam,a73db470faaaaab0d63f16951a99a5de,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24135.vcf.gz,0ed70bb6bdae84c8fa7a47c2837d404f,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24135.zarr.zip,d6f3347d3afa82e9c6f748ed0c14ba82,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24137,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24137.fixmate.bam,80aff105c7d5b08acf8f892ba064220c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24137.vcf.gz,c44c82f6f3788354dabdc4d5c2e220fe,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24137.zarr.zip,75453b10bef61604f12afa03e4d2bffd,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24138,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24138.fixmate.bam,ed21138a3471add892904c6a25f27a8f,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24138.vcf.gz,6468fe510c89e889c2bb444a4f43ab34,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24138.zarr.zip,c2b65c65ff900cf1d86774ea783ff7dc,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24141,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24141.fixmate.bam,a986c027d0ff9f175f155901d79f2528,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24141.vcf.gz,33d23811a768a44c31652c7e6f123fb0,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24141.zarr.zip,edaf0b5b2888b95b7c036684f9ebbf8f,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24142,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24142.fixmate.bam,8477e96b9365dd9eb20a90e191258747,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24142.vcf.gz,d94a777854d5749bbbb02d2d088f65a8,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24142.zarr.zip,2b56bcb72399147783cf5c416dd50fc2,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24143,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24143.fixmate.bam,ec0a3343b6b54e8195868f5605f26337,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24143.vcf.gz,6ae087b689db651b89a762e3f5c99b7d,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24143.zarr.zip,e15848930be0435bc627a82c228a2a56,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24144,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24144.fixmate.bam,309b0a66a48e466b14ca6e03fa09f8c6,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24144.vcf.gz,473d7255ec951099217ae04c39dace77,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24144.zarr.zip,20d1dc5609721ce442daac29e6af95af,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24145,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24145.fixmate.bam,34b866caa916f60f29b4261070f87f47,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24145.vcf.gz,08951b40291bff1865a3d0dd70f9fd7f,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24145.zarr.zip,77aaf53a3d0f12001d83af89829ce329,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24147,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24147.fixmate.bam,4057e1e827b990242f08baf9ced3aff3,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24147.vcf.gz,dbe8b9818dcf5f264213e3a890b76685,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24147.zarr.zip,ed527aa66396ba6fe90fbafbe62f37ca,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24148,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24148.fixmate.bam,7a65d16cf6034674b8d91d745e921195,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24148.vcf.gz,8c500a39554387e95611d4110fc8910c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24148.zarr.zip,b733235dc8d1b6b54da73cdc0f1b96b5,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24149,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24149.fixmate.bam,e3e4e92e3cdde668bc297ccdc57e496c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24149.vcf.gz,2ab8f438dd41c6875d25a950411a30dd,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24149.zarr.zip,693942d111a6f258349874e5d72fb303,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24150,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24150.fixmate.bam,d7f5faf5521138fad9d12927abc92fd1,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24150.vcf.gz,33c8c19f4e6526c6934c8d7949cb2806,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24150.zarr.zip,abccdb2319bf3ad35a21ed49bca11d08,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24152,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24152.fixmate.bam,9c7eb615bcec4951e5e49c27b2d96329,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24152.vcf.gz,6ad4e73dadeae19b06dc383610fe939a,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24152.zarr.zip,a10ff0d48393bba12f1a53add465fa3a,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24153,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24153.fixmate.bam,8ccba0ba353050f259960178c2f49241,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24153.vcf.gz,867c883c90442a331b6efe526f49e599,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24153.zarr.zip,2fb61c99ff000bc55d99c43ddfe3cb97,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24154,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24154.fixmate.bam,64c782c2935d76dd16ad2e28065aef14,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24154.vcf.gz,4021ed25e208bcba8ab03209c2db96c4,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24154.zarr.zip,23a967359a84037ce4fe97ea9df00d22,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24155,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24155.fixmate.bam,a3d04577e19bd01c210eb3e03c68a03b,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24155.vcf.gz,531ca136dc4b2cabe8ee92c76a2118b5,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24155.zarr.zip,5ef8fa77cd2cb1482edaf81d9003b9a1,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24156,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24156.fixmate.bam,8a6d9be4646f483abc0ab8fdde37e302,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24156.vcf.gz,0b078d1658cf818f235e68663664a54c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24156.zarr.zip,8383e7ad1c702ed045a6b0688685b963,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24157,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24157.fixmate.bam,518378595c8e82bbd5918958853cc469,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24157.vcf.gz,4f3ac4cc189ac524e740d26cb7f88d90,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24157.zarr.zip,4533b72772a0491ddcead2e0f0cdbb3e,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24159,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24159.fixmate.bam,82a5ff829b5e5355b420e810bd8846f2,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24159.vcf.gz,df5e97344a02e2b2f1460a745be12792,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24159.zarr.zip,b129909d5a5f73e9e48ddf695ed1f5b1,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24160,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24160.fixmate.bam,70ea4dc94ab52b8fc640c4018abbbe2c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24160.vcf.gz,1fd88e2e4575cd589ec48fd8e8573b7f,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24160.zarr.zip,ab3b81e0b63476200ca3fbcf18f660df,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24161,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24161.fixmate.bam,231a2230e8d190ccc2e0602484eb9f00,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24161.vcf.gz,655159ce615220186894e4928dde44f7,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24161.zarr.zip,e7ab013096871ec4d5fa8e740de1548b,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24162,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24162.fixmate.bam,b5380e105ae66a2ea53812830e22ab57,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24162.vcf.gz,ffd9958275bcbbc76c104580f383a852,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24162.zarr.zip,ff2b2516444350f6e755fe931c12a4a9,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24163,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24163.fixmate.bam,82a5e308f325b4f1d0534aeafd66a438,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24163.vcf.gz,bb6b14ba298575350ebe810a4b1d180e,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24163.zarr.zip,e3f5a8bfe4fcdd09529fac838c6f284f,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24164,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24164.fixmate.bam,4eb99b9e0fe058b41eba708d6b2cecc0,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24164.vcf.gz,66864528a50ddac5e9db5cd523470995,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24164.zarr.zip,31c310138134f73fdffc7f8bfa997d27,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24165,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24165.fixmate.bam,e159585716ddf77646b6d77e690d4c8c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24165.vcf.gz,c00979c58d8a72fcc58590fbf9a78d44,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24165.zarr.zip,5bfbf1af66d50ab0a81b5bed1dc45919,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24166,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24166.fixmate.bam,0980e59068989080e010d6a9a630b143,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24166.vcf.gz,666f0a2c6c61b96a5611dcf40cd724ed,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24166.zarr.zip,44d3ded8335fb60dced1e48a026ab0e9,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24167,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24167.fixmate.bam,e3c2dce4e7a94ad4773bba30fa86a14b,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24167.vcf.gz,c2d78b2836a132d3b751e3393971f6d5,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24167.zarr.zip,f73956d26d56c5c297d21f131351a066,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24168,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24168.fixmate.bam,b02fd401cd47889a5c6e5e8b9eb1988e,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24168.vcf.gz,81327ec566a9f3d15d988ac1bca3c82a,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24168.zarr.zip,2700725dfd74ee83293b2d8d42086301,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24169,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24169.fixmate.bam,b5f916072f04915d35b77bf3cb935819,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24169.vcf.gz,dcf1a0e25b923b68164e744d17e78998,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24169.zarr.zip,0cb54389288de3f186e2c171576bcc27,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24170,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24170.fixmate.bam,fed96af59485e263e35d88838d64e54f,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24170.vcf.gz,327ea555c6a2d51ee3876c4874bc7e93,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24170.zarr.zip,64f9775bfb73a911d022c7b23cc3e49e,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24171,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24171.fixmate.bam,59c755ac6dc6e04b68f619ce1d1d4cf8,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24171.vcf.gz,960250a3d96544211a036e952d7eb54c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24171.zarr.zip,dc4490e04703e32b0018b7f0677decd5,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24172,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24172.fixmate.bam,b533a67c4a682c4dd38a039ecb3b7f57,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24172.vcf.gz,845048eed7d8f47b07b271157962f7b1,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24172.zarr.zip,7ab5616473dca129266d62829bded45f,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24173,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24173.fixmate.bam,1c702357ac0ca7be3247cffaca7b77c5,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24173.vcf.gz,7518d83a02eb279aecf997d3f037a01d,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24173.zarr.zip,bcd0b534fd6c4e0dc00a3aabb4752c7b,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24174,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24174.fixmate.bam,ea2ee1425371366795594c120c6c1c4e,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24174.vcf.gz,871ccfa62008d8085e75c199fdaed641,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24174.zarr.zip,07f3a7652e6344402abe2acb3d1d090f,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24175,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24175.fixmate.bam,bd2adec40ff8ce292d498d69af5ae31f,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24175.vcf.gz,8d68a06baa08a76a426608607d4d8c8b,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24175.zarr.zip,76d00caefe2a043c7c2708ec9dbbcac4,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24176,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24176.fixmate.bam,e3fb8a151fac68c63081d6b4f0e664ae,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24176.vcf.gz,df7d5a2197f10b8cb78a797324aab54e,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24176.zarr.zip,7b1604ea5f78dcf9deb05c06bdeb4f0f,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24177,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24177.fixmate.bam,582594b981aeec31ee7ab2b7e9714407,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24177.vcf.gz,ac058672f238cefff413207c6f965e96,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24177.zarr.zip,52bf2ae31e2c3217ad5e919d45958497,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24178,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24178.fixmate.bam,f06cec2ff02b273883dcd70135ae0830,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24178.vcf.gz,4335bdc194276ad274d13d43caf4eeaa,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24178.zarr.zip,33237f171e167969c31fc29ba6a52608,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24179,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24179.fixmate.bam,f49da526a069e4a29e2e6a57c0171428,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24179.vcf.gz,83b968edf8c5c4413371bf10251c9024,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24179.zarr.zip,8e797231b9d09ec05ba7bb4e865c97b8,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24180,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24180.fixmate.bam,69f19ab9f83f8ac97c8e6f753a463898,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24180.vcf.gz,ae12d5fead1c13d1d96b54417908409c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24180.zarr.zip,987225071d15724b271cf232f7e3b422,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24181,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24181.fixmate.bam,39e6c10fc2b705e8ccb96004ebf8c723,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24181.vcf.gz,02bdd45eeec8f403df1d06b19b10591d,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24181.zarr.zip,ae5d260fd022f3a2cce393466a1ba35d,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24182,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24182.fixmate.bam,7e14e65bef77b32d86299a3df7443482,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24182.vcf.gz,6103b3473fa9bead92b559652cad507c,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24182.zarr.zip,2f5e109421b1c9690485fcce34fc11b6,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24183,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24183.fixmate.bam,fc18ed974bd155e5a99ec7f59b1b634a,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24183.vcf.gz,d7d7d4e6e2914544d10c3fbf5e19cf89,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24183.zarr.zip,7d1566e3a206e725acb25f3f9547bdb6,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24184,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24184.fixmate.bam,9405e2dff85d3216e009ec397ea5147a,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24184.vcf.gz,736cbb135c916936b8c85a0216be8d8a,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24184.zarr.zip,4bfbbde06360311c030ae7821f528f89,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24185,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24185.fixmate.bam,0c47680197125a8e65b727153c1693a8,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24185.vcf.gz,afd611c8dddd596adeebe278b3d1c0ca,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24185.zarr.zip,3e7ebf0f1fd00fe4fd26ee95e40539cb,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24186,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24186.fixmate.bam,f74e8f7b3b25a05e664c9675385372df,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24186.vcf.gz,e972229d54280e2f8dbec4cd5bc478e3,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24186.zarr.zip,b010e946838903f7558efb594d5bb499,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24187,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24187.fixmate.bam,6d7083d103d1a760ab23485844f76891,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24187.vcf.gz,95f69dcec7b65836190d8bb6c3b95cfd,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24187.zarr.zip,5b0d634fbd9e4337b679be826ce88898,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24188,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24188.fixmate.bam,b5b6227aa2bd1ac345b0ad7c5e604094,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24188.vcf.gz,37bf52e21074876008834118bc463ddb,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24188.zarr.zip,900a30f089ad7439ab022be0e34870d0,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24189,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24189.fixmate.bam,cc1daa93c2395a86656f808924fea817,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24189.vcf.gz,c732d936af084135ef0042d61ac8237f,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24189.zarr.zip,d62427385d3f4e8c351cf6db5ab62ebf,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24190,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24190.fixmate.bam,0d2efc6a730d8eb39c3278feb958e48e,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24190.vcf.gz,6d632344cb30a2cd6c10b4c6d0614a33,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24190.zarr.zip,89dabcf282e13cfb41cb9e3cbb653a3a,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24191,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24191.fixmate.bam,c7aff814d70c1601237739a86cab1a37,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24191.vcf.gz,465a59532e48d692311dc71e4373d74f,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24191.zarr.zip,d1d3fa0a419f549aa5d206e940dbe7bd,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24192,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24192.fixmate.bam,d077dd053e469e37eb0051c7d38a5342,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24192.vcf.gz,c5e1df30e1806c95e50cfc47f6d26ce8,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24192.zarr.zip,62cc7c5b7aaac6193fe3f2eea4a589f1,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24193,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24193.fixmate.bam,0aa9a61a3949e8edc42c0524ccbfca1a,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24193.vcf.gz,3191dc1a71c85eaec03bc8f14f8edd1f,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24193.zarr.zip,bdde46d9f6f2e2ffb596059c79aa44b1,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
+VBS24194,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24194.fixmate.bam,f72f4424f1121d651e0eeab5609df38d,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24194.vcf.gz,19e0de154aa3eb0bb3fa4eec16a25783,https://1235-vo-mz-paaijmans-vmf00094.cog.sanger.ac.uk/VBS24194.zarr.zip,a57f0f22ae2c164e02d3303f03d3c39c,https://github.com/malariagen/pipelines/releases/tag/v0.0.7
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
index 46e84a8be..d4d50e62b 100644
--- a/tests/anoph/test_sample_metadata.py
+++ b/tests/anoph/test_sample_metadata.py
@@ -306,7 +306,7 @@ def test_general_metadata_with_multiple_sample_sets(
@parametrize_with_cases("fixture,api", cases=".")
def test_general_metadata_with_release(fixture, api: AnophelesSampleMetadata):
# Set up the test.
- release = random.choice(api.releases)
+ release = random.choice(api.relevant_releases)
# Call function to be tested.
df = api.general_metadata(sample_sets=release)
@@ -390,7 +390,7 @@ def test_sequence_qc_metadata_with_multiple_sample_sets(
@parametrize_with_cases("fixture,api", cases=".")
def test_sequence_qc_metadata_with_release(fixture, api: AnophelesSampleMetadata):
# Set up the test.
- release = random.choice(api.releases)
+ release = random.choice(api.relevant_releases)
# Call function to be tested.
df = api.sequence_qc_metadata(sample_sets=release)
@@ -609,7 +609,7 @@ def test_cohorts_metadata_with_multiple_sample_sets(
@parametrize_with_cases("fixture,api", cases=".")
def test_cohorts_metadata_with_release(fixture, api: AnophelesSampleMetadata):
# Set up test.
- release = random.choice(api.releases)
+ release = random.choice(api.relevant_releases)
# Call function to be tested.
df = api.cohorts_metadata(sample_sets=release)
@@ -719,7 +719,7 @@ def test_sample_metadata_with_multiple_sample_sets(
@parametrize_with_cases("fixture,api", cases=".")
def test_sample_metadata_with_release(fixture, api: AnophelesSampleMetadata):
# Set up test.
- release = random.choice(api.releases)
+ release = random.choice(api.relevant_releases)
# Call function to be tested.
df = api.sample_metadata(sample_sets=release)
@@ -743,7 +743,7 @@ def test_sample_metadata_with_duplicate_sample_sets(
fixture, api: AnophelesSampleMetadata
):
# Set up test.
- release = random.choice(api.releases)
+ release = random.choice(api.relevant_releases)
df_sample_sets = api.sample_sets(release=release).set_index("sample_set")
all_sample_sets = df_sample_sets.index.to_list()
sample_set = random.choice(all_sample_sets)
From 1b6cbb9c16de9ca803a93bd3db9ec8b0d507687c Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Tue, 18 Feb 2025 15:43:24 +0000
Subject: [PATCH 03/32] Update comment re skipping test due to lack of relevant
fixtures
---
tests/anoph/test_sample_metadata.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
index d4d50e62b..080b32816 100644
--- a/tests/anoph/test_sample_metadata.py
+++ b/tests/anoph/test_sample_metadata.py
@@ -269,7 +269,8 @@ def test_general_metadata_with_single_sample_set(fixture, api: AnophelesSampleMe
sample_count = df_sample_sets["sample_count"]
all_sample_sets = df_sample_sets.index.to_list()
- # FIXME: we should probably add more sample sets to the fixtures to test combinations of unrestricted_use_only and surveillance_use_only.
+ # Skip this test if there are no relevant sample sets to test, e.g. due to unrestricted_use_only and surveillance_use_only.
+ # Note: there should be sufficient test fixtures to run this test, i.e. including unrestricted and surveillance sample sets.
if len(all_sample_sets) == 0:
pytest.skip("Skipping because there are no relevant sample sets to test.")
From 02921c9df5234b1bfd697dba10d76d09c22006f9 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 20 Feb 2025 12:29:20 +0000
Subject: [PATCH 04/32] Add surveillance flags to sample_metadata(). Add tests.
---
malariagen_data/anoph/sample_metadata.py | 100 +-
tests/anoph/conftest.py | 50 +
.../surveillance.flags.csv | 45 +
.../surveillance.flags.csv | 52 +
.../surveillance.flags.csv | 436 ++++++
.../surveillance.flags.csv | 1304 +++++++++++++++++
.../general/AG1000G-AO/surveillance.flags.csv | 135 ++
.../AG1000G-BF-A/surveillance.flags.csv | 334 +++++
tests/anoph/test_sample_metadata.py | 19 +-
9 files changed, 2470 insertions(+), 5 deletions(-)
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1229-VO-GH-DADZIE-VMF00095/surveillance.flags.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1230-VO-GA-CF-AYALA-VMF00045/surveillance.flags.csv
create mode 100644 tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1231-VO-MULTI-WONDJI-VMF00043/surveillance.flags.csv
create mode 100644 tests/anoph/fixture/vo_agam_release_master_us_central1/v3.1/metadata/general/1177-VO-ML-LEHMANN-VMF00004/surveillance.flags.csv
create mode 100644 tests/anoph/fixture/vo_agam_release_master_us_central1/v3/metadata/general/AG1000G-AO/surveillance.flags.csv
create mode 100644 tests/anoph/fixture/vo_agam_release_master_us_central1/v3/metadata/general/AG1000G-BF-A/surveillance.flags.csv
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 3088508ba..804ddc9e0 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -1,6 +1,7 @@
import io
from itertools import cycle
from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union
+import warnings
import ipyleaflet # type: ignore
import numpy as np
@@ -329,6 +330,87 @@ def sequence_qc_metadata(
sample_sets=sample_sets,
)
+ def _parse_surveillance_flags(
+ self, sample_set: str, data: Union[bytes, Exception]
+ ) -> pd.DataFrame:
+ # Snapshot the global warning filters so they can be put back after a warning is forced to display.
+ original_warning_filters = warnings.filters[:]
+
+ # Expected dtype per column; used both when reading the CSV and when building the fallback frame.
+ dtype = {
+ "sample_id": "object",
+ "is_surveillance": "bool",
+ }
+
+ if isinstance(data, bytes):
+ # Raw CSV bytes were supplied: parse them, treating empty fields as null.
+ df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+
+ # Warn if any cell is null. NOTE(review): a null in the bool-typed column may make read_csv raise before this check is reached - confirm.
+ if df.isnull().values.any():
+ # Insert a "default" filter for UserWarning ahead of the existing filters so this warning is not silenced by them.
+ warnings.simplefilter("default", UserWarning)
+ warnings.warn(
+ f"WARNING: The surveillance flags data contains null values for sample set {sample_set}",
+ UserWarning,
+ )
+
+ # Put the saved filter list back by rebinding warnings.filters to the snapshot.
+ warnings.filters = original_warning_filters
+
+ # Normalise column names to lower case. NOTE(review): dtype was already applied by name above, so differently-cased headers would not have matched it - confirm.
+ df.columns = [c.lower() for c in df.columns] # type: ignore
+
+ return df
+
+ elif isinstance(data, FileNotFoundError):
+ # The surveillance flags file does not exist for this sample set.
+ # Show a warning and return a placeholder DataFrame built from the sample ids in the general metadata.
+
+ # Insert a "default" filter for UserWarning ahead of the existing filters so this warning is not silenced by them.
+ warnings.simplefilter("default", UserWarning)
+ warnings.warn(
+ f"WARNING: The surveillance flags data is missing for sample set {sample_set}",
+ UserWarning,
+ )
+
+ # Put the saved filter list back by rebinding warnings.filters to the snapshot.
+ warnings.filters = original_warning_filters
+
+ # Copy the sample_id column from this sample set's general metadata to form the rows of the placeholder.
+ df_general = self.general_metadata(sample_sets=sample_set)
+ df = df_general[["sample_id"]].copy()
+
+ # Fill the is_surveillance column with NaN for every row.
+ df["is_surveillance"] = np.nan
+
+ # Cast to the expected dtypes. NOTE(review): casting NaN to "bool" yields True, so every sample here ends up flagged True - confirm this is intended.
+ df = df.astype(dtype)
+
+ return df
+
+ else:
+ raise data
+
+ @check_types
+ @doc(
+ summary="""
+ Access surveillance flags for one or more sample sets.
+ """,
+ returns="""A pandas DataFrame, one row per sample. The columns are:
+ `sample_id` is the identifier of the sample,
+ `is_surveillance` indicates whether the sample can be used for surveillance,
+ """,
+ )
+ def surveillance_flags(
+ self, sample_sets: Optional[base_params.sample_sets] = None
+ ) -> pd.DataFrame:
+ return self._parse_metadata_paths(
+ path_template="{release_path}/metadata/general/{sample_set}/surveillance.flags.csv",  # one flags file per sample set, under that set's metadata/general directory
+ parse_metadata_func=self._parse_surveillance_flags,  # accepts either the file bytes or the exception raised while loading
+ sample_sets=sample_sets,
+ )
+
@property
def _cohorts_analysis(self):
if self._cohorts_analysis_override:
@@ -589,15 +671,23 @@ def sample_metadata(
df_samples = self.general_metadata(sample_sets=prepped_sample_sets)
# Merge with the sequence QC metadata.
+ # Note: merging can change column dtypes, e.g. due to new NaNs.
df_sequence_qc = self.sequence_qc_metadata(
sample_sets=prepped_sample_sets
)
-
- # Note: merging can change column dtypes
df_samples = df_samples.merge(
df_sequence_qc, on="sample_id", sort=False, how="left"
)
+ # Merge with the surveillance flags.
+ # Note: merging can change column dtypes, e.g. due to new NaNs.
+ df_surveillance_flags = self.surveillance_flags(
+ sample_sets=prepped_sample_sets
+ )
+ df_samples = df_samples.merge(
+ df_surveillance_flags, on="sample_id", sort=False, how="left"
+ )
+
# If available, merge with the AIM metadata.
if self._aim_analysis:
df_aim = self.aim_metadata(sample_sets=prepped_sample_sets)
@@ -612,6 +702,10 @@ def sample_metadata(
df_cohorts, on="sample_id", sort=False, how="left"
)
+ # If surveillance_use_only, restrict to samples with is_surveillance.
+ if "is_surveillance" in df_samples.columns and self._surveillance_use_only:
+ df_samples = df_samples[df_samples["is_surveillance"].astype(bool)]
+
# Store sample metadata in the cache.
self._cache_sample_metadata[cache_key] = df_samples
@@ -619,7 +713,7 @@ def sample_metadata(
for on, data in self._extra_metadata:
df_samples = df_samples.merge(data, how="left", on=on)
- # For convenience, apply a sample selection.
+ # Apply the sample_query or sample_indices, if specified.
if sample_query is not None:
# Assume a pandas query string.
sample_query_options = sample_query_options or {}
diff --git a/tests/anoph/conftest.py b/tests/anoph/conftest.py
index 9d41d0736..153e70c3d 100644
--- a/tests/anoph/conftest.py
+++ b/tests/anoph/conftest.py
@@ -1256,6 +1256,31 @@ def write_metadata(
dst_path.parent.mkdir(parents=True, exist_ok=True)
df_general_ds.to_csv(dst_path, index=False)
+ # Create surveillance flags by sample from real metadata files.
+ surv_flags_src_path = (
+ self.fixture_dir
+ / "vo_agam_release_master_us_central1"
+ / release_path
+ / "metadata"
+ / "general"
+ / sample_set
+ / "surveillance.flags.csv"
+ )
+ df_surveillance_flags = pd.read_csv(surv_flags_src_path)
+ df_surveillance_flags_ds = (
+ df_surveillance_flags.set_index("sample_id").loc[samples_ds].reset_index()
+ )
+ surv_flags_dst_path = (
+ self.bucket_path
+ / release_path
+ / "metadata"
+ / "general"
+ / sample_set
+ / "surveillance.flags.csv"
+ )
+ surv_flags_dst_path.parent.mkdir(parents=True, exist_ok=True)
+ df_surveillance_flags_ds.to_csv(surv_flags_dst_path, index=False)
+
if sequence_qc:
# Create sequence QC metadata by sample from real metadata files.
src_path = (
@@ -2010,6 +2035,31 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
dst_path.parent.mkdir(parents=True, exist_ok=True)
df_general_ds.to_csv(dst_path, index=False)
+ # Create surveillance flags by sample from real metadata files.
+ surv_flags_src_path = (
+ self.fixture_dir
+ / "vo_afun_release_master_us_central1"
+ / release_path
+ / "metadata"
+ / "general"
+ / sample_set
+ / "surveillance.flags.csv"
+ )
+ df_surveillance_flags = pd.read_csv(surv_flags_src_path)
+ df_surveillance_flags_ds = (
+ df_surveillance_flags.set_index("sample_id").loc[samples_ds].reset_index()
+ )
+ surv_flags_dst_path = (
+ self.bucket_path
+ / release_path
+ / "metadata"
+ / "general"
+ / sample_set
+ / "surveillance.flags.csv"
+ )
+ surv_flags_dst_path.parent.mkdir(parents=True, exist_ok=True)
+ df_surveillance_flags_ds.to_csv(surv_flags_dst_path, index=False)
+
if sequence_qc:
# Create sequence QC metadata by sample from real metadata files.
src_path = (
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1229-VO-GH-DADZIE-VMF00095/surveillance.flags.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1229-VO-GH-DADZIE-VMF00095/surveillance.flags.csv
new file mode 100644
index 000000000..90fcd5bb1
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1229-VO-GH-DADZIE-VMF00095/surveillance.flags.csv
@@ -0,0 +1,45 @@
+sample_id,is_surveillance
+VBS24195,True
+VBS24196,True
+VBS24197,True
+VBS24198,True
+VBS24199,True
+VBS24200,True
+VBS24201,True
+VBS24202,True
+VBS24203,True
+VBS24204,True
+VBS24205,True
+VBS24206,True
+VBS24207,True
+VBS24208,True
+VBS24209,True
+VBS24210,True
+VBS24213,True
+VBS24216,True
+VBS24217,True
+VBS24218,True
+VBS24221,True
+VBS24222,True
+VBS24223,True
+VBS24224,True
+VBS24225,True
+VBS24226,True
+VBS24227,True
+VBS24228,True
+VBS24229,True
+VBS24230,True
+VBS24231,True
+VBS24232,True
+VBS24233,True
+VBS24234,True
+VBS24235,True
+VBS24236,True
+VBS24237,True
+VBS24238,True
+VBS24239,True
+VBS24240,True
+VBS24241,True
+VBS24242,True
+VBS24243,True
+VBS24244,True
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1230-VO-GA-CF-AYALA-VMF00045/surveillance.flags.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1230-VO-GA-CF-AYALA-VMF00045/surveillance.flags.csv
new file mode 100644
index 000000000..4f56853eb
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1230-VO-GA-CF-AYALA-VMF00045/surveillance.flags.csv
@@ -0,0 +1,52 @@
+sample_id,is_surveillance
+VBS17729,True
+VBS17730,True
+VBS17731,True
+VBS17732,True
+VBS17733,True
+VBS17734,True
+VBS17735,True
+VBS17736,True
+VBS17737,True
+VBS17738,True
+VBS17739,True
+VBS17740,True
+VBS17741,True
+VBS17742,True
+VBS17743,True
+VBS17744,True
+VBS17745,True
+VBS17746,True
+VBS17747,True
+VBS17748,True
+VBS17749,True
+VBS17750,True
+VBS17751,True
+VBS17752,True
+VBS17753,True
+VBS17754,True
+VBS17755,True
+VBS17756,True
+VBS17757,True
+VBS17758,True
+VBS17759,True
+VBS17760,True
+VBS17761,True
+VBS17762,True
+VBS17763,True
+VBS17764,True
+VBS17765,True
+VBS17766,True
+VBS17767,True
+VBS17768,True
+VBS17770,True
+VBS17772,True
+VBS17777,True
+VBS17778,True
+VBS17780,True
+VBS17783,True
+VBS17784,True
+VBS17785,True
+VBS17786,True
+VBS17798,True
+VBS17801,True
diff --git a/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1231-VO-MULTI-WONDJI-VMF00043/surveillance.flags.csv b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1231-VO-MULTI-WONDJI-VMF00043/surveillance.flags.csv
new file mode 100644
index 000000000..ff1a3d224
--- /dev/null
+++ b/tests/anoph/fixture/vo_afun_release_master_us_central1/v1.0/metadata/general/1231-VO-MULTI-WONDJI-VMF00043/surveillance.flags.csv
@@ -0,0 +1,436 @@
+sample_id,is_surveillance
+VBS17190,True
+VBS17191,True
+VBS17192,True
+VBS17193,True
+VBS17194,True
+VBS17195,True
+VBS17196,True
+VBS17197,True
+VBS17198,True
+VBS17199,True
+VBS17200,True
+VBS17201,True
+VBS17202,True
+VBS17203,True
+VBS17204,True
+VBS17205,True
+VBS17206,True
+VBS17207,True
+VBS17208,True
+VBS17209,True
+VBS17210,True
+VBS17211,True
+VBS17212,True
+VBS17213,True
+VBS17214,True
+VBS17215,True
+VBS17216,True
+VBS17217,True
+VBS17218,True
+VBS17219,True
+VBS17220,True
+VBS17221,True
+VBS17222,True
+VBS17223,True
+VBS17224,True
+VBS17225,True
+VBS17226,True
+VBS17227,True
+VBS17228,True
+VBS17229,True
+VBS17230,True
+VBS17231,True
+VBS17232,True
+VBS17233,True
+VBS17234,True
+VBS17235,True
+VBS17236,True
+VBS17237,True
+VBS17238,True
+VBS17239,True
+VBS17240,True
+VBS17241,True
+VBS17242,True
+VBS17243,True
+VBS17244,True
+VBS17245,True
+VBS17246,True
+VBS17247,True
+VBS17248,True
+VBS17249,True
+VBS17250,True
+VBS17251,True
+VBS17252,True
+VBS17253,True
+VBS17254,True
+VBS17255,True
+VBS17256,True
+VBS17257,True
+VBS17258,True
+VBS17259,True
+VBS17260,True
+VBS17261,True
+VBS17262,True
+VBS17263,True
+VBS17264,True
+VBS17265,True
+VBS17266,True
+VBS17267,True
+VBS17268,True
+VBS17269,True
+VBS17270,True
+VBS17271,True
+VBS17272,True
+VBS17273,True
+VBS17274,True
+VBS17275,True
+VBS17276,True
+VBS17277,True
+VBS17278,True
+VBS17279,True
+VBS17280,True
+VBS17281,True
+VBS17282,True
+VBS17283,True
+VBS17284,True
+VBS17285,True
+VBS17286,True
+VBS17287,True
+VBS17288,True
+VBS17289,True
+VBS17290,True
+VBS17291,True
+VBS17292,True
+VBS17293,True
+VBS17294,True
+VBS17295,True
+VBS17296,True
+VBS17297,True
+VBS17298,True
+VBS17299,True
+VBS17300,True
+VBS17301,True
+VBS17302,True
+VBS17303,True
+VBS17304,True
+VBS17305,True
+VBS17306,True
+VBS17307,True
+VBS17308,True
+VBS17309,True
+VBS17310,True
+VBS17311,True
+VBS17312,True
+VBS17313,True
+VBS17314,True
+VBS17315,True
+VBS17316,True
+VBS17317,True
+VBS17318,True
+VBS17319,True
+VBS17320,True
+VBS17321,True
+VBS17322,True
+VBS17323,True
+VBS17324,True
+VBS17325,True
+VBS17326,True
+VBS17327,True
+VBS17328,True
+VBS17329,True
+VBS17330,True
+VBS17331,True
+VBS17332,True
+VBS17333,True
+VBS17334,True
+VBS17335,True
+VBS17336,True
+VBS17337,True
+VBS17338,True
+VBS17339,True
+VBS17340,True
+VBS17341,True
+VBS17342,True
+VBS17343,True
+VBS17344,True
+VBS17345,True
+VBS17346,True
+VBS17347,True
+VBS17348,True
+VBS17349,True
+VBS17350,True
+VBS17351,True
+VBS17352,True
+VBS17353,True
+VBS17354,True
+VBS17355,True
+VBS17356,True
+VBS17357,True
+VBS17358,True
+VBS17359,True
+VBS17360,True
+VBS17361,True
+VBS17362,True
+VBS17363,True
+VBS17364,True
+VBS17365,True
+VBS17366,True
+VBS17367,True
+VBS17368,True
+VBS17369,True
+VBS17370,True
+VBS17371,True
+VBS17372,True
+VBS17373,True
+VBS17374,True
+VBS17376,True
+VBS17378,True
+VBS17379,True
+VBS17380,True
+VBS17381,True
+VBS17382,True
+VBS17383,True
+VBS17384,True
+VBS17386,True
+VBS17387,True
+VBS17388,True
+VBS17389,True
+VBS17391,True
+VBS17392,True
+VBS17393,True
+VBS17394,True
+VBS17395,True
+VBS17396,True
+VBS17397,True
+VBS17398,True
+VBS17399,True
+VBS17400,True
+VBS17401,True
+VBS17402,True
+VBS17403,True
+VBS17404,True
+VBS17405,True
+VBS17406,True
+VBS17407,True
+VBS17408,True
+VBS17409,True
+VBS17410,True
+VBS17411,True
+VBS17412,True
+VBS17413,True
+VBS17414,True
+VBS17415,True
+VBS17416,True
+VBS17417,True
+VBS17418,True
+VBS17419,True
+VBS17420,True
+VBS17421,True
+VBS17422,True
+VBS17423,True
+VBS17424,True
+VBS17425,True
+VBS17426,True
+VBS17427,True
+VBS17428,True
+VBS17429,True
+VBS17430,True
+VBS17431,True
+VBS17432,True
+VBS17433,True
+VBS17434,True
+VBS17435,True
+VBS17436,True
+VBS17437,True
+VBS17438,True
+VBS17439,True
+VBS17440,True
+VBS17441,True
+VBS17442,True
+VBS17443,True
+VBS17444,True
+VBS17445,True
+VBS17446,True
+VBS17447,True
+VBS17448,True
+VBS17449,True
+VBS17450,True
+VBS17451,True
+VBS17452,True
+VBS17453,True
+VBS17454,True
+VBS17455,True
+VBS17456,True
+VBS17457,True
+VBS17458,True
+VBS17459,True
+VBS17460,True
+VBS17461,True
+VBS17462,True
+VBS17463,True
+VBS17464,True
+VBS17465,True
+VBS17466,True
+VBS17467,True
+VBS17468,True
+VBS17469,True
+VBS17470,True
+VBS17471,True
+VBS17472,True
+VBS17473,True
+VBS17474,True
+VBS17475,True
+VBS17476,True
+VBS17477,True
+VBS17478,True
+VBS17479,True
+VBS17480,True
+VBS17481,True
+VBS17482,True
+VBS17483,True
+VBS17484,True
+VBS17485,True
+VBS17486,True
+VBS17487,True
+VBS17488,True
+VBS17489,True
+VBS17490,True
+VBS17491,True
+VBS17492,True
+VBS17494,True
+VBS17495,True
+VBS17496,True
+VBS17497,True
+VBS17498,True
+VBS17499,True
+VBS17500,True
+VBS17501,True
+VBS17502,True
+VBS17503,True
+VBS17504,True
+VBS17505,True
+VBS17506,True
+VBS17507,True
+VBS17508,True
+VBS17509,True
+VBS17510,True
+VBS17511,True
+VBS17512,True
+VBS17514,True
+VBS17515,True
+VBS17516,True
+VBS17517,True
+VBS17518,True
+VBS17519,True
+VBS17520,True
+VBS17521,True
+VBS17522,True
+VBS17523,True
+VBS17524,True
+VBS17525,True
+VBS17526,True
+VBS17527,True
+VBS17528,True
+VBS17529,True
+VBS17530,True
+VBS17531,True
+VBS17532,True
+VBS17533,True
+VBS17534,True
+VBS17535,True
+VBS17536,True
+VBS17537,True
+VBS17538,True
+VBS17539,True
+VBS17540,True
+VBS17541,True
+VBS17542,True
+VBS17543,True
+VBS17544,True
+VBS17545,True
+VBS17546,True
+VBS17547,True
+VBS17548,True
+VBS17549,True
+VBS17550,True
+VBS17551,True
+VBS17552,True
+VBS17553,True
+VBS17554,True
+VBS17555,True
+VBS17556,True
+VBS17557,True
+VBS17558,True
+VBS17559,True
+VBS17560,True
+VBS17561,True
+VBS17562,True
+VBS17563,True
+VBS17564,True
+VBS17565,True
+VBS17566,True
+VBS17567,True
+VBS17568,True
+VBS17569,True
+VBS17570,True
+VBS17571,True
+VBS17572,True
+VBS17573,True
+VBS17574,True
+VBS17575,True
+VBS17576,True
+VBS17577,True
+VBS17578,True
+VBS17579,True
+VBS17580,True
+VBS17581,True
+VBS17582,True
+VBS17583,True
+VBS17584,True
+VBS17585,True
+VBS17586,True
+VBS17587,True
+VBS17588,True
+VBS17589,True
+VBS17590,True
+VBS17591,True
+VBS17592,True
+VBS17593,True
+VBS17594,True
+VBS17595,True
+VBS17596,True
+VBS17597,True
+VBS17598,True
+VBS17599,True
+VBS17600,True
+VBS17601,True
+VBS17602,True
+VBS17603,True
+VBS17604,True
+VBS17605,True
+VBS17606,True
+VBS17607,True
+VBS17608,True
+VBS17609,True
+VBS17610,True
+VBS17611,True
+VBS17612,True
+VBS17613,True
+VBS17614,True
+VBS17615,True
+VBS17616,True
+VBS17617,True
+VBS17618,True
+VBS17619,True
+VBS17620,True
+VBS17621,True
+VBS17622,True
+VBS17623,True
+VBS17624,True
+VBS17625,True
+VBS17626,True
+VBS17627,True
+VBS17628,True
+VBS17629,True
+VBS17630,True
diff --git a/tests/anoph/fixture/vo_agam_release_master_us_central1/v3.1/metadata/general/1177-VO-ML-LEHMANN-VMF00004/surveillance.flags.csv b/tests/anoph/fixture/vo_agam_release_master_us_central1/v3.1/metadata/general/1177-VO-ML-LEHMANN-VMF00004/surveillance.flags.csv
new file mode 100644
index 000000000..2a78c33e8
--- /dev/null
+++ b/tests/anoph/fixture/vo_agam_release_master_us_central1/v3.1/metadata/general/1177-VO-ML-LEHMANN-VMF00004/surveillance.flags.csv
@@ -0,0 +1,1304 @@
+sample_id,is_surveillance
+VBS00192-4651STDY7017182,True
+VBS00240-4651STDY7017183,True
+VBS00256-4651STDY7017184,True
+VBS00257-4651STDY7017185,True
+VBS00259-4651STDY7017186,True
+VBS00262-4651STDY7017187,True
+VBS00270-4651STDY7017188,True
+VBS00277-4651STDY7017189,True
+VBS00278-4651STDY7017190,True
+VBS00288-4651STDY7017191,True
+VBS00289-4651STDY7017192,True
+VBS00293-4651STDY7017193,True
+VBS00309-4651STDY7017194,True
+VBS00323-4651STDY7017195,True
+VBS00331-4651STDY7017196,True
+VBS00343-4651STDY7017197,True
+VBS00344-4651STDY7017198,True
+VBS00345-4651STDY7017199,True
+VBS00350-4651STDY7017200,True
+VBS00351-4651STDY7017201,True
+VBS00352-4651STDY7017202,True
+VBS00353-4651STDY7017203,True
+VBS00354-4651STDY7017204,True
+VBS00355-4651STDY7017205,True
+VBS00356-4651STDY7017206,True
+VBS00358-4651STDY7017207,True
+VBS00359-4651STDY7017208,True
+VBS00361-4651STDY7017209,True
+VBS00363-4651STDY7017210,True
+VBS00364-4651STDY7017211,True
+VBS00365-4651STDY7017212,True
+VBS00366-4651STDY7017213,True
+VBS00367-4651STDY7017214,True
+VBS00368-4651STDY7017215,True
+VBS00370-4651STDY7017216,True
+VBS00371-4651STDY7017217,True
+VBS00373-4651STDY7017218,True
+VBS00374-4651STDY7017219,True
+VBS00375-4651STDY7017220,True
+VBS00379-4651STDY7017221,True
+VBS00388-4651STDY7017222,True
+VBS00449-4651STDY7017223,True
+VBS00511-4651STDY7017224,True
+VBS00912-4651STDY7017225,True
+VBS00913-4651STDY7017226,True
+VBS00916-4651STDY7017227,True
+VBS00917-4651STDY7017228,True
+VBS00918-4651STDY7017229,True
+VBS00919-4651STDY7017230,True
+VBS00920-4651STDY7017231,True
+VBS00922-4651STDY7017232,True
+VBS00925-4651STDY7017233,True
+VBS00926-4651STDY7017234,True
+VBS00927-4651STDY7017235,True
+VBS00928-4651STDY7017236,True
+VBS00929-4651STDY7017237,True
+VBS00931-4651STDY7017238,True
+VBS00932-4651STDY7017239,True
+VBS00933-4651STDY7017240,True
+VBS00935-4651STDY7017241,True
+VBS00936-4651STDY7017242,True
+VBS00937-4651STDY7017243,True
+VBS00942-4651STDY7017244,True
+VBS00943-4651STDY7017245,True
+VBS00944-4651STDY7017246,True
+VBS00945-4651STDY7017247,True
+VBS00949-4651STDY7017248,True
+VBS00950-4651STDY7017249,True
+VBS00951-4651STDY7017250,True
+VBS00952-4651STDY7017251,True
+VBS00953-4651STDY7017252,True
+VBS00957-4651STDY7017253,True
+VBS00958-4651STDY7017254,True
+VBS00960-4651STDY7017255,True
+VBS00961-4651STDY7017256,True
+VBS00962-4651STDY7017257,True
+VBS00964-4651STDY7017258,True
+VBS00969-4651STDY7017259,True
+VBS00970-4651STDY7017260,True
+VBS00971-4651STDY7017261,True
+VBS00972-4651STDY7017262,True
+VBS00974-4651STDY7017263,True
+VBS00975-4651STDY7017264,True
+VBS00976-4651STDY7017265,True
+VBS00977-4651STDY7017266,True
+VBS00982-4651STDY7017267,True
+VBS00983-4651STDY7017268,True
+VBS00985-4651STDY7017269,True
+VBS00986-4651STDY7017270,True
+VBS00989-4651STDY7017271,True
+VBS00991-4651STDY7017272,True
+VBS00992-4651STDY7017273,True
+VBS00994-4651STDY7017274,True
+VBS00997-4651STDY7017275,True
+VBS00999-4651STDY7017278,True
+VBS01000-4651STDY7017279,True
+VBS01001-4651STDY7017280,True
+VBS01002-4651STDY7017281,True
+VBS01004-4651STDY7017282,True
+VBS01005-4651STDY7017283,True
+VBS01006-4651STDY7017284,True
+VBS01007-4651STDY7017285,True
+VBS01008-4651STDY7017286,True
+VBS01009-4651STDY7017287,True
+VBS01012-4651STDY7017288,True
+VBS01013-4651STDY7017289,True
+VBS01014-4651STDY7017290,True
+VBS01015-4651STDY7017291,True
+VBS01016-4651STDY7017292,True
+VBS01018-4651STDY7017293,True
+VBS01022-4651STDY7017294,True
+VBS01026-4651STDY7017295,True
+VBS01029-4651STDY7017296,True
+VBS01030-4651STDY7017297,True
+VBS01031-4651STDY7017298,True
+VBS01037-4651STDY7017299,True
+VBS01039-4651STDY7017300,True
+VBS01044-4651STDY7017301,True
+VBS01045-4651STDY7017302,True
+VBS01046-4651STDY7017303,True
+VBS01048-4651STDY7017304,True
+VBS01049-4651STDY7017305,True
+VBS01050-4651STDY7017306,True
+VBS01052-4651STDY7017307,True
+VBS01053-4651STDY7017308,True
+VBS01054-4651STDY7017309,True
+VBS01055-4651STDY7017310,True
+VBS01056-4651STDY7017311,True
+VBS01057-4651STDY7017312,True
+VBS01059-4651STDY7017313,True
+VBS01061-4651STDY7017314,True
+VBS01063-4651STDY7017315,True
+VBS01064-4651STDY7017316,True
+VBS01069-4651STDY7017317,True
+VBS01070-4651STDY7017318,True
+VBS01071-4651STDY7017319,True
+VBS01072-4651STDY7017320,True
+VBS01073-4651STDY7017321,True
+VBS01074-4651STDY7017322,True
+VBS01075-4651STDY7017323,True
+VBS01078-4651STDY7017324,True
+VBS01079-4651STDY7017325,True
+VBS01080-4651STDY7017326,True
+VBS01081-4651STDY7017327,True
+VBS01082-4651STDY7017328,True
+VBS01085-4651STDY7017329,True
+VBS01086-4651STDY7017330,True
+VBS01087-4651STDY7017331,True
+VBS01089-4651STDY7017332,True
+VBS01093-4651STDY7017333,True
+VBS01102-4651STDY7017334,True
+VBS01107-4651STDY7017335,True
+VBS01108-4651STDY7017336,True
+VBS01109-4651STDY7017337,True
+VBS01111-4651STDY7017338,True
+VBS01112-4651STDY7017339,True
+VBS01115-4651STDY7017340,True
+VBS01116-4651STDY7017341,True
+VBS01123-4651STDY7017342,True
+VBS01124-4651STDY7017343,True
+VBS01125-4651STDY7017344,True
+VBS01126-4651STDY7017345,True
+VBS01129-4651STDY7017346,True
+VBS01131-4651STDY7017347,True
+VBS01133-4651STDY7017348,True
+VBS01138-4651STDY7017349,True
+VBS01139-4651STDY7017350,True
+VBS01140-4651STDY7017351,True
+VBS01141-4651STDY7017352,True
+VBS01142-4651STDY7017353,True
+VBS01144-4651STDY7017354,True
+VBS01149-4651STDY7017355,True
+VBS01150-4651STDY7017356,True
+VBS01152-4651STDY7017357,True
+VBS01158-4651STDY7017358,True
+VBS01161-4651STDY7017359,True
+VBS01162-4651STDY7017360,True
+VBS01163-4651STDY7017361,True
+VBS01164-4651STDY7017362,True
+VBS01165-4651STDY7017363,True
+VBS01166-4651STDY7017364,True
+VBS01170-4651STDY7017365,True
+VBS01171-4651STDY7017366,True
+VBS01173-4651STDY7017367,True
+VBS01174-4651STDY7017368,True
+VBS01175-4651STDY7017369,True
+VBS01178-4651STDY7017370,True
+VBS01179-4651STDY7017371,True
+VBS01180-4651STDY7017374,True
+VBS01182-4651STDY7017375,True
+VBS01184-4651STDY7017376,True
+VBS01187-4651STDY7017377,True
+VBS01188-4651STDY7017378,True
+VBS01189-4651STDY7017379,True
+VBS01191-4651STDY7017380,True
+VBS01194-4651STDY7017381,True
+VBS01195-4651STDY7017382,True
+VBS01197-4651STDY7017383,True
+VBS01199-4651STDY7017384,True
+VBS01200-4651STDY7017385,True
+VBS01201-4651STDY7017386,True
+VBS01202-4651STDY7017387,True
+VBS01203-4651STDY7017388,True
+VBS01204-4651STDY7017389,True
+VBS01205-4651STDY7017390,True
+VBS01206-4651STDY7017391,True
+VBS01207-4651STDY7017392,True
+VBS01208-4651STDY7017393,True
+VBS01209-4651STDY7017394,True
+VBS01210-4651STDY7017395,True
+VBS01211-4651STDY7017396,True
+VBS01212-4651STDY7017397,True
+VBS01213-4651STDY7017398,True
+VBS01215-4651STDY7017399,True
+VBS01216-4651STDY7017400,True
+VBS01218-4651STDY7017401,True
+VBS01219-4651STDY7017402,True
+VBS01220-4651STDY7017403,True
+VBS01222-4651STDY7017404,True
+VBS01223-4651STDY7017405,True
+VBS01224-4651STDY7017406,True
+VBS01226-4651STDY7017407,True
+VBS01227-4651STDY7017408,True
+VBS01228-4651STDY7017409,True
+VBS01229-4651STDY7017410,True
+VBS01230-4651STDY7017411,True
+VBS01231-4651STDY7017412,True
+VBS01232-4651STDY7017413,True
+VBS01233-4651STDY7017414,True
+VBS01234-4651STDY7017415,True
+VBS01235-4651STDY7017416,True
+VBS01236-4651STDY7017417,True
+VBS01237-4651STDY7017418,True
+VBS01238-4651STDY7017419,True
+VBS01239-4651STDY7017420,True
+VBS01240-4651STDY7017421,True
+VBS01241-4651STDY7017422,True
+VBS01242-4651STDY7017423,True
+VBS01244-4651STDY7017424,True
+VBS01245-4651STDY7017425,True
+VBS01246-4651STDY7017426,True
+VBS01247-4651STDY7017427,True
+VBS01248-4651STDY7017428,True
+VBS01250-4651STDY7017429,True
+VBS01251-4651STDY7017430,True
+VBS01252-4651STDY7017431,True
+VBS01253-4651STDY7017432,True
+VBS01254-4651STDY7017433,True
+VBS01256-4651STDY7017434,True
+VBS01257-4651STDY7017435,True
+VBS01258-4651STDY7017436,True
+VBS01259-4651STDY7017437,True
+VBS01261-4651STDY7017438,True
+VBS01262-4651STDY7017439,True
+VBS01265-4651STDY7017440,True
+VBS01266-4651STDY7017441,True
+VBS01267-4651STDY7017442,True
+VBS01268-4651STDY7017443,True
+VBS01269-4651STDY7017444,True
+VBS01271-4651STDY7017445,True
+VBS01272-4651STDY7017446,True
+VBS01273-4651STDY7017447,True
+VBS01274-4651STDY7017448,True
+VBS01275-4651STDY7017449,True
+VBS01276-4651STDY7017450,True
+VBS01277-4651STDY7017451,True
+VBS01278-4651STDY7017452,True
+VBS01279-4651STDY7017453,True
+VBS01280-4651STDY7017454,True
+VBS01291-4651STDY7017455,True
+VBS01296-4651STDY7017456,True
+VBS01297-4651STDY7017457,True
+VBS01303-4651STDY7017458,True
+VBS01304-4651STDY7017459,True
+VBS01314-4651STDY7017460,True
+VBS01315-4651STDY7017461,True
+VBS01316-4651STDY7017462,True
+VBS01317-4651STDY7017463,True
+VBS01320-4651STDY7017464,True
+VBS01322-4651STDY7017465,True
+VBS01323-4651STDY7017466,True
+VBS01328-4651STDY7017467,True
+VBS01329-4651STDY7017470,True
+VBS01330-4651STDY7017471,True
+VBS01335-4651STDY7017472,True
+VBS01336-4651STDY7017473,True
+VBS01338-4651STDY7017474,True
+VBS01340-4651STDY7017475,True
+VBS01341-4651STDY7017476,True
+VBS01344-4651STDY7017477,True
+VBS01345-4651STDY7017478,True
+VBS01346-4651STDY7017479,True
+VBS01347-4651STDY7017480,True
+VBS01348-4651STDY7017481,True
+VBS01350-4651STDY7017482,True
+VBS01351-4651STDY7017483,True
+VBS01352-4651STDY7017484,True
+VBS01353-4651STDY7017485,True
+VBS01354-4651STDY7017486,True
+VBS01356-4651STDY7017487,True
+VBS01358-4651STDY7017488,True
+VBS01361-4651STDY7017489,True
+VBS01362-4651STDY7017490,True
+VBS01363-4651STDY7017491,True
+VBS01365-4651STDY7017492,True
+VBS01366-4651STDY7017493,True
+VBS01367-4651STDY7017494,True
+VBS01368-4651STDY7017495,True
+VBS01369-4651STDY7017496,True
+VBS01370-4651STDY7017497,True
+VBS01372-4651STDY7017498,True
+VBS01382-4651STDY7017499,True
+VBS01384-4651STDY7017500,True
+VBS01389-4651STDY7017501,True
+VBS01390-4651STDY7017502,True
+VBS01391-4651STDY7017503,True
+VBS01392-4651STDY7017504,True
+VBS01397-4651STDY7017505,True
+VBS01398-4651STDY7017506,True
+VBS01399-4651STDY7017507,True
+VBS01400-4651STDY7017508,True
+VBS01401-4651STDY7017509,True
+VBS01405-4651STDY7017510,True
+VBS01406-4651STDY7017511,True
+VBS01407-4651STDY7017512,True
+VBS01408-4651STDY7017513,True
+VBS01412-4651STDY7017514,True
+VBS01413-4651STDY7017515,True
+VBS01414-4651STDY7017516,True
+VBS01415-4651STDY7017517,True
+VBS01416-4651STDY7017518,True
+VBS01418-4651STDY7017519,True
+VBS01420-4651STDY7017520,True
+VBS01421-4651STDY7017521,True
+VBS01428-4651STDY7017522,True
+VBS01430-4651STDY7017523,True
+VBS01431-4651STDY7017524,True
+VBS01432-4651STDY7017525,True
+VBS01433-4651STDY7017526,True
+VBS01434-4651STDY7017527,True
+VBS01435-4651STDY7017528,True
+VBS01436-4651STDY7017529,True
+VBS01438-4651STDY7017530,True
+VBS01439-4651STDY7017531,True
+VBS01442-4651STDY7017532,True
+VBS01444-4651STDY7017533,True
+VBS01445-4651STDY7017534,True
+VBS01446-4651STDY7017535,True
+VBS01447-4651STDY7017536,True
+VBS01448-4651STDY7017537,True
+VBS01451-4651STDY7017538,True
+VBS01452-4651STDY7017539,True
+VBS01454-4651STDY7017540,True
+VBS01456-4651STDY7017541,True
+VBS01457-4651STDY7017542,True
+VBS01459-4651STDY7017543,True
+VBS01462-4651STDY7017544,True
+VBS01463-4651STDY7017545,True
+VBS01464-4651STDY7017546,True
+VBS01465-4651STDY7017547,True
+VBS01466-4651STDY7017548,True
+VBS01467-4651STDY7017549,True
+VBS01470-4651STDY7017550,True
+VBS01472-4651STDY7017551,True
+VBS01473-4651STDY7017552,True
+VBS01475-4651STDY7017553,True
+VBS01476-4651STDY7017554,True
+VBS01477-4651STDY7017555,True
+VBS01478-4651STDY7017556,True
+VBS01484-4651STDY7017557,True
+VBS01492-4651STDY7017558,True
+VBS01503-4651STDY7017559,True
+VBS01506-4651STDY7017560,True
+VBS01508-4651STDY7017561,True
+VBS01509-4651STDY7017562,True
+VBS01510-4651STDY7017563,True
+VBS01511-4651STDY7017566,True
+VBS01513-4651STDY7017567,True
+VBS01516-4651STDY7017568,True
+VBS01517-4651STDY7017569,True
+VBS01518-4651STDY7017570,True
+VBS01519-4651STDY7017571,True
+VBS01520-4651STDY7017572,True
+VBS01521-4651STDY7017573,True
+VBS01522-4651STDY7017574,True
+VBS01524-4651STDY7017575,True
+VBS01525-4651STDY7017576,True
+VBS01526-4651STDY7017577,True
+VBS01527-4651STDY7017578,True
+VBS01528-4651STDY7017579,True
+VBS01530-4651STDY7017580,True
+VBS01532-4651STDY7017581,True
+VBS01533-4651STDY7017582,True
+VBS01534-4651STDY7017583,True
+VBS01535-4651STDY7017584,True
+VBS01536-4651STDY7017585,True
+VBS01537-4651STDY7017586,True
+VBS01538-4651STDY7017587,True
+VBS01540-4651STDY7017588,True
+VBS01541-4651STDY7017589,True
+VBS01542-4651STDY7017590,True
+VBS01543-4651STDY7017591,True
+VBS01544-4651STDY7017592,True
+VBS01546-4651STDY7017593,True
+VBS01548-4651STDY7017594,True
+VBS01549-4651STDY7017595,True
+VBS01551-4651STDY7017596,True
+VBS01552-4651STDY7017597,True
+VBS01553-4651STDY7017598,True
+VBS01554-4651STDY7017599,True
+VBS01556-4651STDY7017600,True
+VBS01557-4651STDY7017601,True
+VBS01558-4651STDY7017602,True
+VBS01562-4651STDY7017603,True
+VBS01564-4651STDY7017604,True
+VBS01565-4651STDY7017605,True
+VBS01566-4651STDY7017606,True
+VBS01569-4651STDY7017607,True
+VBS01570-4651STDY7017608,True
+VBS01571-4651STDY7017609,True
+VBS01572-4651STDY7017610,True
+VBS01574-4651STDY7017611,True
+VBS01575-4651STDY7017612,True
+VBS01576-4651STDY7017613,True
+VBS01578-4651STDY7017614,True
+VBS01579-4651STDY7017615,True
+VBS01580-4651STDY7017616,True
+VBS01583-4651STDY7017617,True
+VBS01584-4651STDY7017618,True
+VBS01586-4651STDY7017619,True
+VBS01587-4651STDY7017620,True
+VBS01590-4651STDY7017621,True
+VBS01591-4651STDY7017622,True
+VBS01592-4651STDY7017623,True
+VBS01594-4651STDY7017624,True
+VBS01595-4651STDY7017625,True
+VBS01596-4651STDY7017626,True
+VBS01597-4651STDY7017627,True
+VBS01603-4651STDY7017628,True
+VBS01606-4651STDY7017629,True
+VBS01608-4651STDY7017630,True
+VBS01610-4651STDY7017631,True
+VBS01615-4651STDY7017632,True
+VBS01626-4651STDY7017633,True
+VBS01631-4651STDY7017634,True
+VBS01636-4651STDY7017635,True
+VBS01637-4651STDY7017636,True
+VBS01640-4651STDY7017637,True
+VBS01642-4651STDY7017638,True
+VBS01643-4651STDY7017639,True
+VBS01647-4651STDY7017640,True
+VBS01651-4651STDY7017641,True
+VBS01656-4651STDY7017642,True
+VBS01658-4651STDY7017643,True
+VBS01659-4651STDY7017644,True
+VBS01664-4651STDY7017645,True
+VBS01669-4651STDY7017646,True
+VBS01670-4651STDY7017647,True
+VBS01672-4651STDY7017648,True
+VBS01673-4651STDY7017649,True
+VBS01674-4651STDY7017650,True
+VBS01678-4651STDY7017651,True
+VBS01679-4651STDY7017652,True
+VBS01680-4651STDY7017653,True
+VBS01681-4651STDY7017654,True
+VBS01688-4651STDY7017655,True
+VBS01689-4651STDY7017656,True
+VBS01690-4651STDY7017657,True
+VBS01693-4651STDY7017658,True
+VBS01695-4651STDY7017659,True
+VBS01696-4651STDY7017662,True
+VBS01697-4651STDY7017663,True
+VBS01698-4651STDY7017664,True
+VBS01701-4651STDY7017665,True
+VBS01702-4651STDY7017666,True
+VBS01704-4651STDY7017667,True
+VBS01705-4651STDY7017668,True
+VBS01708-4651STDY7017669,True
+VBS01709-4651STDY7017670,True
+VBS01710-4651STDY7017671,True
+VBS01712-4651STDY7017672,True
+VBS01713-4651STDY7017673,True
+VBS01714-4651STDY7017674,True
+VBS01715-4651STDY7017675,True
+VBS01721-4651STDY7017676,True
+VBS01722-4651STDY7017677,True
+VBS01723-4651STDY7017678,True
+VBS01726-4651STDY7017679,True
+VBS01727-4651STDY7017680,True
+VBS01728-4651STDY7017681,True
+VBS01729-4651STDY7017682,True
+VBS01730-4651STDY7017683,True
+VBS01734-4651STDY7017684,True
+VBS01735-4651STDY7017685,True
+VBS01736-4651STDY7017686,True
+VBS01737-4651STDY7017687,True
+VBS01740-4651STDY7017688,True
+VBS01742-4651STDY7017689,True
+VBS01743-4651STDY7017690,True
+VBS01744-4651STDY7017691,True
+VBS01746-4651STDY7017692,True
+VBS01747-4651STDY7017693,True
+VBS01749-4651STDY7017694,True
+VBS01750-4651STDY7017695,True
+VBS01751-4651STDY7017696,True
+VBS01752-4651STDY7017697,True
+VBS01753-4651STDY7017698,True
+VBS01754-4651STDY7017699,True
+VBS01755-4651STDY7017700,True
+VBS01756-4651STDY7017701,True
+VBS01758-4651STDY7017702,True
+VBS01759-4651STDY7017703,True
+VBS01760-4651STDY7017704,True
+VBS01761-4651STDY7017705,True
+VBS01762-4651STDY7017706,True
+VBS01763-4651STDY7017707,True
+VBS01764-4651STDY7017708,True
+VBS01766-4651STDY7017709,True
+VBS01767-4651STDY7017710,True
+VBS01768-4651STDY7017711,True
+VBS01769-4651STDY7017712,True
+VBS01770-4651STDY7017713,True
+VBS01771-4651STDY7017714,True
+VBS01772-4651STDY7017715,True
+VBS01773-4651STDY7017716,True
+VBS01774-4651STDY7017717,True
+VBS01775-4651STDY7017718,True
+VBS01776-4651STDY7017719,True
+VBS01777-4651STDY7017720,True
+VBS01778-4651STDY7017721,True
+VBS01779-4651STDY7017722,True
+VBS01780-4651STDY7017723,True
+VBS01781-4651STDY7017724,True
+VBS01782-4651STDY7017725,True
+VBS01783-4651STDY7017726,True
+VBS01784-4651STDY7017727,True
+VBS01785-4651STDY7017728,True
+VBS01786-4651STDY7017729,True
+VBS01787-4651STDY7017730,True
+VBS01788-4651STDY7017731,True
+VBS01789-4651STDY7017732,True
+VBS01790-4651STDY7017733,True
+VBS01791-4651STDY7017734,True
+VBS01792-4651STDY7017735,True
+VBS01793-4651STDY7017736,True
+VBS01794-4651STDY7017737,True
+VBS01795-4651STDY7017738,True
+VBS01796-4651STDY7017739,True
+VBS01797-4651STDY7017740,True
+VBS01798-4651STDY7017741,True
+VBS01799-4651STDY7017742,True
+VBS01801-4651STDY7017743,True
+VBS01802-4651STDY7017744,True
+VBS01803-4651STDY7017745,True
+VBS01804-4651STDY7017746,True
+VBS01805-4651STDY7017747,True
+VBS01806-4651STDY7017748,True
+VBS01807-4651STDY7017749,True
+VBS01808-4651STDY7017750,True
+VBS01809-4651STDY7017751,True
+VBS01810-4651STDY7017752,True
+VBS01811-4651STDY7017753,True
+VBS01812-4651STDY7017754,True
+VBS01813-4651STDY7017755,True
+VBS01814-4651STDY7017758,True
+VBS01815-4651STDY7017759,True
+VBS01816-4651STDY7017760,True
+VBS01817-4651STDY7017761,True
+VBS01818-4651STDY7017762,True
+VBS01819-4651STDY7017763,True
+VBS01820-4651STDY7017764,True
+VBS01821-4651STDY7017765,True
+VBS01822-4651STDY7017766,True
+VBS01823-4651STDY7017767,True
+VBS01824-4651STDY7017768,True
+VBS01825-4651STDY7017769,True
+VBS01826-4651STDY7017770,True
+VBS01827-4651STDY7017771,True
+VBS01828-4651STDY7017772,True
+VBS01829-4651STDY7017773,True
+VBS01830-4651STDY7017774,True
+VBS01831-4651STDY7017775,True
+VBS01832-4651STDY7017776,True
+VBS01833-4651STDY7017777,True
+VBS01834-4651STDY7017778,True
+VBS01835-4651STDY7017779,True
+VBS01836-4651STDY7017780,True
+VBS01838-4651STDY7017781,True
+VBS01839-4651STDY7017782,True
+VBS01840-4651STDY7017783,True
+VBS01841-4651STDY7017784,True
+VBS01842-4651STDY7017785,True
+VBS01843-4651STDY7017786,True
+VBS01844-4651STDY7017787,True
+VBS01846-4651STDY7017788,True
+VBS01847-4651STDY7017789,True
+VBS01848-4651STDY7017790,True
+VBS01849-4651STDY7017791,True
+VBS01850-4651STDY7017792,True
+VBS01851-4651STDY7017793,True
+VBS01852-4651STDY7017794,True
+VBS01853-4651STDY7017795,True
+VBS01858-4651STDY7017796,True
+VBS01860-4651STDY7017797,True
+VBS01861-4651STDY7017798,True
+VBS01862-4651STDY7017799,True
+VBS01864-4651STDY7017800,True
+VBS01865-4651STDY7017801,True
+VBS01866-4651STDY7017802,True
+VBS01868-4651STDY7017803,True
+VBS01869-4651STDY7017804,True
+VBS01870-4651STDY7017805,True
+VBS01871-4651STDY7017806,True
+VBS01873-4651STDY7017807,True
+VBS01874-4651STDY7017808,True
+VBS01876-4651STDY7017809,True
+VBS01877-4651STDY7017810,True
+VBS01878-4651STDY7017811,True
+VBS01879-4651STDY7017812,True
+VBS01882-4651STDY7017813,True
+VBS01884-4651STDY7017814,True
+VBS01885-4651STDY7017815,True
+VBS01886-4651STDY7017816,True
+VBS01887-4651STDY7017817,True
+VBS01888-4651STDY7017818,True
+VBS01889-4651STDY7017819,True
+VBS01890-4651STDY7017820,True
+VBS01892-4651STDY7017821,True
+VBS01893-4651STDY7017822,True
+VBS01896-4651STDY7017823,True
+VBS01897-4651STDY7017824,True
+VBS01898-4651STDY7017825,True
+VBS01899-4651STDY7017826,True
+VBS01900-4651STDY7017827,True
+VBS01901-4651STDY7017828,True
+VBS01902-4651STDY7017829,True
+VBS01903-4651STDY7017830,True
+VBS01904-4651STDY7017831,True
+VBS01905-4651STDY7017832,True
+VBS01906-4651STDY7017833,True
+VBS01907-4651STDY7017834,True
+VBS01908-4651STDY7017835,True
+VBS01909-4651STDY7017836,True
+VBS01910-4651STDY7017837,True
+VBS01911-4651STDY7017838,True
+VBS01912-4651STDY7017839,True
+VBS01913-4651STDY7017840,True
+VBS01914-4651STDY7017841,True
+VBS01915-4651STDY7017842,True
+VBS01916-4651STDY7017843,True
+VBS01917-4651STDY7017844,True
+VBS01918-4651STDY7017845,True
+VBS01919-4651STDY7017846,True
+VBS01920-4651STDY7017847,True
+VBS01921-4651STDY7017848,True
+VBS01922-4651STDY7017849,True
+VBS01924-4651STDY7017850,True
+VBS01925-4651STDY7017851,True
+VBS01926-4651STDY7017854,True
+VBS01927-4651STDY7017855,True
+VBS01928-4651STDY7017856,True
+VBS01929-4651STDY7017857,True
+VBS01930-4651STDY7017858,True
+VBS01931-4651STDY7017859,True
+VBS01932-4651STDY7017860,True
+VBS01933-4651STDY7017861,True
+VBS01934-4651STDY7017862,True
+VBS01935-4651STDY7017863,True
+VBS01937-4651STDY7017864,True
+VBS01938-4651STDY7017865,True
+VBS01939-4651STDY7017866,True
+VBS01940-4651STDY7017867,True
+VBS01941-4651STDY7017868,True
+VBS01942-4651STDY7017869,True
+VBS01943-4651STDY7017870,True
+VBS01944-4651STDY7017871,True
+VBS01945-4651STDY7017872,True
+VBS01946-4651STDY7017873,True
+VBS01947-4651STDY7017874,True
+VBS01948-4651STDY7017875,True
+VBS01949-4651STDY7017876,True
+VBS01950-4651STDY7017877,True
+VBS01951-4651STDY7017878,True
+VBS01952-4651STDY7017879,True
+VBS01953-4651STDY7017880,True
+VBS01954-4651STDY7017881,True
+VBS01955-4651STDY7017882,True
+VBS01956-4651STDY7017883,True
+VBS01957-4651STDY7017884,True
+VBS01958-4651STDY7017885,True
+VBS01959-4651STDY7017886,True
+VBS01960-4651STDY7017887,True
+VBS01961-4651STDY7017888,True
+VBS01962-4651STDY7017889,True
+VBS01963-4651STDY7017890,True
+VBS01964-4651STDY7017891,True
+VBS01965-4651STDY7017892,True
+VBS01966-4651STDY7017893,True
+VBS01967-4651STDY7017894,True
+VBS01968-4651STDY7017895,True
+VBS01970-4651STDY7017896,True
+VBS01971-4651STDY7017897,True
+VBS01972-4651STDY7017898,True
+VBS01973-4651STDY7017899,True
+VBS01974-4651STDY7017900,True
+VBS01975-4651STDY7017901,True
+VBS01976-4651STDY7017902,True
+VBS01978-4651STDY7017903,True
+VBS01979-4651STDY7017904,True
+VBS01980-4651STDY7017905,True
+VBS01981-4651STDY7017906,True
+VBS01982-4651STDY7017907,True
+VBS01983-4651STDY7017908,True
+VBS01984-4651STDY7017909,True
+VBS01985-4651STDY7017910,True
+VBS01986-4651STDY7017911,True
+VBS01987-4651STDY7017912,True
+VBS01988-4651STDY7017913,True
+VBS01989-4651STDY7017914,True
+VBS01990-4651STDY7017915,True
+VBS01991-4651STDY7017916,True
+VBS01992-4651STDY7017917,True
+VBS01994-4651STDY7017918,True
+VBS01995-4651STDY7017919,True
+VBS01996-4651STDY7017920,True
+VBS01997-4651STDY7017921,True
+VBS01998-4651STDY7017922,True
+VBS01999-4651STDY7017923,True
+VBS02000-4651STDY7017924,True
+VBS02001-4651STDY7017925,True
+VBS02002-4651STDY7017926,True
+VBS06347-4651STDY7017927,True
+VBS00161-4651STDY7016510,True
+VBS00162-4651STDY7016511,True
+VBS00163-4651STDY7016512,True
+VBS00164-4651STDY7016513,True
+VBS00165-4651STDY7016514,True
+VBS00167-4651STDY7016515,True
+VBS00168-4651STDY7016516,True
+VBS00169-4651STDY7016517,True
+VBS00170-4651STDY7016518,True
+VBS00171-4651STDY7016519,True
+VBS00172-4651STDY7016520,True
+VBS00174-4651STDY7016521,True
+VBS00176-4651STDY7016522,True
+VBS00177-4651STDY7016523,True
+VBS00178-4651STDY7016524,True
+VBS00179-4651STDY7016525,True
+VBS00180-4651STDY7016526,True
+VBS00181-4651STDY7016527,True
+VBS00182-4651STDY7016528,True
+VBS00183-4651STDY7016529,True
+VBS00184-4651STDY7016530,True
+VBS00185-4651STDY7016531,True
+VBS00186-4651STDY7016532,True
+VBS00187-4651STDY7016533,True
+VBS00188-4651STDY7016534,True
+VBS00189-4651STDY7016535,True
+VBS00190-4651STDY7016536,True
+VBS00191-4651STDY7016537,True
+VBS00193-4651STDY7016538,True
+VBS00194-4651STDY7016539,True
+VBS00195-4651STDY7016540,True
+VBS00197-4651STDY7016541,True
+VBS00199-4651STDY7016542,True
+VBS00200-4651STDY7016543,True
+VBS00201-4651STDY7016544,True
+VBS00202-4651STDY7016545,True
+VBS00203-4651STDY7016546,True
+VBS00204-4651STDY7016547,True
+VBS00205-4651STDY7016548,True
+VBS00206-4651STDY7016549,True
+VBS00207-4651STDY7016550,True
+VBS00208-4651STDY7016551,True
+VBS00210-4651STDY7016552,True
+VBS00211-4651STDY7016553,True
+VBS00212-4651STDY7016554,True
+VBS00213-4651STDY7016555,True
+VBS00214-4651STDY7016556,True
+VBS00215-4651STDY7016557,True
+VBS00216-4651STDY7016558,True
+VBS00217-4651STDY7016559,True
+VBS00218-4651STDY7016560,True
+VBS00219-4651STDY7016561,True
+VBS00220-4651STDY7016562,True
+VBS00221-4651STDY7016563,True
+VBS00222-4651STDY7016564,True
+VBS00223-4651STDY7016565,True
+VBS00224-4651STDY7016566,True
+VBS00225-4651STDY7016567,True
+VBS00226-4651STDY7016568,True
+VBS00227-4651STDY7016569,True
+VBS00228-4651STDY7016570,True
+VBS00230-4651STDY7016571,True
+VBS00231-4651STDY7016572,True
+VBS00232-4651STDY7016573,True
+VBS00234-4651STDY7016574,True
+VBS00238-4651STDY7016575,True
+VBS00240-4651STDY7016576,True
+VBS00249-4651STDY7016577,True
+VBS00251-4651STDY7016578,True
+VBS00252-4651STDY7016579,True
+VBS00260-4651STDY7016580,True
+VBS00269-4651STDY7016581,True
+VBS00280-4651STDY7016582,True
+VBS00282-4651STDY7016583,True
+VBS00285-4651STDY7016584,True
+VBS00290-4651STDY7016585,True
+VBS00291-4651STDY7016586,True
+VBS00295-4651STDY7016587,True
+VBS00296-4651STDY7016588,True
+VBS00297-4651STDY7016589,True
+VBS00298-4651STDY7016590,True
+VBS00299-4651STDY7016591,True
+VBS00302-4651STDY7016592,True
+VBS00307-4651STDY7016593,True
+VBS00308-4651STDY7016594,True
+VBS00310-4651STDY7016595,True
+VBS00311-4651STDY7016596,True
+VBS00312-4651STDY7016597,True
+VBS00313-4651STDY7016598,True
+VBS00320-4651STDY7016599,True
+VBS00321-4651STDY7016600,True
+VBS00322-4651STDY7016601,True
+VBS00328-4651STDY7016602,True
+VBS00334-4651STDY7016603,True
+VBS00335-4651STDY7016606,True
+VBS00338-4651STDY7016607,True
+VBS00339-4651STDY7016608,True
+VBS00340-4651STDY7016609,True
+VBS00341-4651STDY7016610,True
+VBS00342-4651STDY7016611,True
+VBS00346-4651STDY7016612,True
+VBS00347-4651STDY7016613,True
+VBS00348-4651STDY7016614,True
+VBS00349-4651STDY7016615,True
+VBS00357-4651STDY7016616,True
+VBS00360-4651STDY7016617,True
+VBS00362-4651STDY7016618,True
+VBS00369-4651STDY7016619,True
+VBS00376-4651STDY7016620,True
+VBS00377-4651STDY7016621,True
+VBS00378-4651STDY7016622,True
+VBS00380-4651STDY7016623,True
+VBS00381-4651STDY7016624,True
+VBS00382-4651STDY7016625,True
+VBS00383-4651STDY7016626,True
+VBS00384-4651STDY7016627,True
+VBS00385-4651STDY7016628,True
+VBS00386-4651STDY7016629,True
+VBS00387-4651STDY7016630,True
+VBS00389-4651STDY7016631,True
+VBS00390-4651STDY7016632,True
+VBS00391-4651STDY7016633,True
+VBS00392-4651STDY7016634,True
+VBS00393-4651STDY7016635,True
+VBS00394-4651STDY7016636,True
+VBS00395-4651STDY7016637,True
+VBS00396-4651STDY7016638,True
+VBS00397-4651STDY7016639,True
+VBS00398-4651STDY7016640,True
+VBS00399-4651STDY7016641,True
+VBS00400-4651STDY7016642,True
+VBS00401-4651STDY7016643,True
+VBS00402-4651STDY7016644,True
+VBS00403-4651STDY7016645,True
+VBS00404-4651STDY7016646,True
+VBS00405-4651STDY7016647,True
+VBS00406-4651STDY7016648,True
+VBS00407-4651STDY7016649,True
+VBS00408-4651STDY7016650,True
+VBS00409-4651STDY7016651,True
+VBS00410-4651STDY7016652,True
+VBS00411-4651STDY7016653,True
+VBS00412-4651STDY7016654,True
+VBS00413-4651STDY7016655,True
+VBS00414-4651STDY7016656,True
+VBS00415-4651STDY7016657,True
+VBS00416-4651STDY7016658,True
+VBS00417-4651STDY7016659,True
+VBS00418-4651STDY7016660,True
+VBS00419-4651STDY7016661,True
+VBS00420-4651STDY7016662,True
+VBS00421-4651STDY7016663,True
+VBS00422-4651STDY7016664,True
+VBS00423-4651STDY7016665,True
+VBS00424-4651STDY7016666,True
+VBS00425-4651STDY7016667,True
+VBS00426-4651STDY7016668,True
+VBS00427-4651STDY7016669,True
+VBS00428-4651STDY7016670,True
+VBS00429-4651STDY7016671,True
+VBS00430-4651STDY7016672,True
+VBS00431-4651STDY7016673,True
+VBS00432-4651STDY7016674,True
+VBS00433-4651STDY7016675,True
+VBS00434-4651STDY7016676,True
+VBS00435-4651STDY7016677,True
+VBS00436-4651STDY7016678,True
+VBS00437-4651STDY7016679,True
+VBS00438-4651STDY7016680,True
+VBS00439-4651STDY7016681,True
+VBS00440-4651STDY7016682,True
+VBS00441-4651STDY7016683,True
+VBS00442-4651STDY7016684,True
+VBS00443-4651STDY7016685,True
+VBS00444-4651STDY7016686,True
+VBS00445-4651STDY7016687,True
+VBS00446-4651STDY7016688,True
+VBS00448-4651STDY7016689,True
+VBS00450-4651STDY7016690,True
+VBS00451-4651STDY7016691,True
+VBS00452-4651STDY7016692,True
+VBS00453-4651STDY7016693,True
+VBS00454-4651STDY7016694,True
+VBS00455-4651STDY7016695,True
+VBS00456-4651STDY7016696,True
+VBS00457-4651STDY7016697,True
+VBS00458-4651STDY7016698,True
+VBS00459-4651STDY7016699,True
+VBS00461-4651STDY7016702,True
+VBS00463-4651STDY7016703,True
+VBS00464-4651STDY7016704,True
+VBS00466-4651STDY7016705,True
+VBS00467-4651STDY7016706,True
+VBS00468-4651STDY7016707,True
+VBS00469-4651STDY7016708,True
+VBS00470-4651STDY7016709,True
+VBS00471-4651STDY7016710,True
+VBS00472-4651STDY7016711,True
+VBS00473-4651STDY7016712,True
+VBS00474-4651STDY7016713,True
+VBS00475-4651STDY7016714,True
+VBS00476-4651STDY7016715,True
+VBS00477-4651STDY7016716,True
+VBS00478-4651STDY7016717,True
+VBS00479-4651STDY7016718,True
+VBS00480-4651STDY7016719,True
+VBS00481-4651STDY7016720,True
+VBS00482-4651STDY7016721,True
+VBS00483-4651STDY7016722,True
+VBS00484-4651STDY7016723,True
+VBS00485-4651STDY7016724,True
+VBS00486-4651STDY7016725,True
+VBS00487-4651STDY7016726,True
+VBS00488-4651STDY7016727,True
+VBS00489-4651STDY7016728,True
+VBS00490-4651STDY7016729,True
+VBS00491-4651STDY7016730,True
+VBS00492-4651STDY7016731,True
+VBS00493-4651STDY7016732,True
+VBS00494-4651STDY7016733,True
+VBS00496-4651STDY7016734,True
+VBS00497-4651STDY7016735,True
+VBS00498-4651STDY7016736,True
+VBS00499-4651STDY7016737,True
+VBS00500-4651STDY7016738,True
+VBS00501-4651STDY7016739,True
+VBS00502-4651STDY7016740,True
+VBS00503-4651STDY7016741,True
+VBS00504-4651STDY7016742,True
+VBS00505-4651STDY7016743,True
+VBS00506-4651STDY7016744,True
+VBS00507-4651STDY7016745,True
+VBS00508-4651STDY7016746,True
+VBS00509-4651STDY7016747,True
+VBS00510-4651STDY7016748,True
+VBS00512-4651STDY7016749,True
+VBS00513-4651STDY7016750,True
+VBS00514-4651STDY7016751,True
+VBS00515-4651STDY7016752,True
+VBS00516-4651STDY7016753,True
+VBS00517-4651STDY7016754,True
+VBS00518-4651STDY7016755,True
+VBS00519-4651STDY7016756,True
+VBS00521-4651STDY7016757,True
+VBS00522-4651STDY7016758,True
+VBS00523-4651STDY7016759,True
+VBS00524-4651STDY7016760,True
+VBS00525-4651STDY7016761,True
+VBS00526-4651STDY7016762,True
+VBS00527-4651STDY7016763,True
+VBS00528-4651STDY7016764,True
+VBS00529-4651STDY7016765,True
+VBS00530-4651STDY7016766,True
+VBS00531-4651STDY7016767,True
+VBS00532-4651STDY7016768,True
+VBS00533-4651STDY7016769,True
+VBS00534-4651STDY7016770,True
+VBS00535-4651STDY7016771,True
+VBS00536-4651STDY7016772,True
+VBS00538-4651STDY7016773,True
+VBS00539-4651STDY7016774,True
+VBS00540-4651STDY7016775,True
+VBS00542-4651STDY7016776,True
+VBS00544-4651STDY7016777,True
+VBS00545-4651STDY7016778,True
+VBS00546-4651STDY7016779,True
+VBS00547-4651STDY7016780,True
+VBS00548-4651STDY7016781,True
+VBS00549-4651STDY7016782,True
+VBS00551-4651STDY7016783,True
+VBS00555-4651STDY7016784,True
+VBS00559-4651STDY7016785,True
+VBS00560-4651STDY7016786,True
+VBS00561-4651STDY7016787,True
+VBS00562-4651STDY7016788,True
+VBS00564-4651STDY7016789,True
+VBS00565-4651STDY7016790,True
+VBS00567-4651STDY7016791,True
+VBS00579-4651STDY7016792,True
+VBS00580-4651STDY7016793,True
+VBS00584-4651STDY7016794,True
+VBS00589-4651STDY7016795,True
+VBS00593-4651STDY7016798,True
+VBS00597-4651STDY7016799,True
+VBS00598-4651STDY7016800,True
+VBS00599-4651STDY7016801,True
+VBS00601-4651STDY7016802,True
+VBS00602-4651STDY7016803,True
+VBS00603-4651STDY7016804,True
+VBS00605-4651STDY7016805,True
+VBS00609-4651STDY7016806,True
+VBS00611-4651STDY7016807,True
+VBS00612-4651STDY7016808,True
+VBS00616-4651STDY7016809,True
+VBS00618-4651STDY7016810,True
+VBS00620-4651STDY7016811,True
+VBS00621-4651STDY7016812,True
+VBS00623-4651STDY7016813,True
+VBS00624-4651STDY7016814,True
+VBS00625-4651STDY7016815,True
+VBS00626-4651STDY7016816,True
+VBS00627-4651STDY7016817,True
+VBS00630-4651STDY7016818,True
+VBS00631-4651STDY7016819,True
+VBS00632-4651STDY7016820,True
+VBS00634-4651STDY7016821,True
+VBS00635-4651STDY7016822,True
+VBS00636-4651STDY7016823,True
+VBS00637-4651STDY7016824,True
+VBS00638-4651STDY7016825,True
+VBS00639-4651STDY7016826,True
+VBS00640-4651STDY7016827,True
+VBS00641-4651STDY7016828,True
+VBS00642-4651STDY7016829,True
+VBS00643-4651STDY7016830,True
+VBS00646-4651STDY7016831,True
+VBS00648-4651STDY7016832,True
+VBS00652-4651STDY7016833,True
+VBS00654-4651STDY7016834,True
+VBS00655-4651STDY7016835,True
+VBS00657-4651STDY7016836,True
+VBS00658-4651STDY7016837,True
+VBS00659-4651STDY7016838,True
+VBS00660-4651STDY7016839,True
+VBS00661-4651STDY7016840,True
+VBS00662-4651STDY7016841,True
+VBS00663-4651STDY7016842,True
+VBS00664-4651STDY7016843,True
+VBS00667-4651STDY7016844,True
+VBS00668-4651STDY7016845,True
+VBS00673-4651STDY7016846,True
+VBS00674-4651STDY7016847,True
+VBS00678-4651STDY7016848,True
+VBS00679-4651STDY7016849,True
+VBS00680-4651STDY7016850,True
+VBS00681-4651STDY7016851,True
+VBS00682-4651STDY7016852,True
+VBS00683-4651STDY7016853,True
+VBS00686-4651STDY7016854,True
+VBS00689-4651STDY7016855,True
+VBS00690-4651STDY7016856,True
+VBS00691-4651STDY7016857,True
+VBS00692-4651STDY7016858,True
+VBS00695-4651STDY7016859,True
+VBS00696-4651STDY7016860,True
+VBS00697-4651STDY7016861,True
+VBS00702-4651STDY7016862,True
+VBS00704-4651STDY7016863,True
+VBS00710-4651STDY7016864,True
+VBS00711-4651STDY7016865,True
+VBS00713-4651STDY7016866,True
+VBS00715-4651STDY7016867,True
+VBS00718-4651STDY7016868,True
+VBS00719-4651STDY7016869,True
+VBS00720-4651STDY7016870,True
+VBS00721-4651STDY7016871,True
+VBS00723-4651STDY7016872,True
+VBS00724-4651STDY7016873,True
+VBS00726-4651STDY7016874,True
+VBS00729-4651STDY7016875,True
+VBS00730-4651STDY7016876,True
+VBS00731-4651STDY7016877,True
+VBS00739-4651STDY7016878,True
+VBS00741-4651STDY7016879,True
+VBS00743-4651STDY7016880,True
+VBS00747-4651STDY7016881,True
+VBS00750-4651STDY7016882,True
+VBS00751-4651STDY7016883,True
+VBS00752-4651STDY7016884,True
+VBS00756-4651STDY7016885,True
+VBS00763-4651STDY7016886,True
+VBS00764-4651STDY7016887,True
+VBS00767-4651STDY7016888,True
+VBS00771-4651STDY7016889,True
+VBS00772-4651STDY7016890,True
+VBS00776-4651STDY7016891,True
+VBS00777-4651STDY7016894,True
+VBS00778-4651STDY7016895,True
+VBS00786-4651STDY7016896,True
+VBS00801-4651STDY7016897,True
+VBS00802-4651STDY7016898,True
+VBS00805-4651STDY7016899,True
+VBS00806-4651STDY7016900,True
+VBS00807-4651STDY7016901,True
+VBS00810-4651STDY7016902,True
+VBS00814-4651STDY7016903,True
+VBS00820-4651STDY7016904,True
+VBS00822-4651STDY7016905,True
+VBS00823-4651STDY7016906,True
+VBS00824-4651STDY7016907,True
+VBS00826-4651STDY7016908,True
+VBS00828-4651STDY7016909,True
+VBS00829-4651STDY7016910,True
+VBS00830-4651STDY7016911,True
+VBS00832-4651STDY7016912,True
+VBS00833-4651STDY7016913,True
+VBS00841-4651STDY7016914,True
+VBS00842-4651STDY7016915,True
+VBS00843-4651STDY7016916,True
+VBS00844-4651STDY7016917,True
+VBS00847-4651STDY7016918,True
+VBS00848-4651STDY7016919,True
+VBS00849-4651STDY7016920,True
+VBS00853-4651STDY7016921,True
+VBS00854-4651STDY7016922,True
+VBS00856-4651STDY7016923,True
+VBS00857-4651STDY7016924,True
+VBS00859-4651STDY7016925,True
+VBS00860-4651STDY7016926,True
+VBS00861-4651STDY7016927,True
+VBS00863-4651STDY7016928,True
+VBS00865-4651STDY7016929,True
+VBS00866-4651STDY7016930,True
+VBS00871-4651STDY7016931,True
+VBS00874-4651STDY7016932,True
+VBS00876-4651STDY7016933,True
+VBS00878-4651STDY7016934,True
+VBS00879-4651STDY7016935,True
+VBS00882-4651STDY7016936,True
+VBS00883-4651STDY7016937,True
+VBS00884-4651STDY7016938,True
+VBS00890-4651STDY7016939,True
+VBS00891-4651STDY7016940,True
+VBS00893-4651STDY7016941,True
+VBS00894-4651STDY7016942,True
+VBS00895-4651STDY7016943,True
+VBS00899-4651STDY7016944,True
+VBS00900-4651STDY7016945,True
+VBS00901-4651STDY7016946,True
+VBS00902-4651STDY7016947,True
+VBS00903-4651STDY7016948,True
+VBS00905-4651STDY7016949,True
+VBS00906-4651STDY7016950,True
+VBS00907-4651STDY7016951,True
+VBS00908-4651STDY7016952,True
+VBS00909-4651STDY7016953,True
+VBS00910-4651STDY7016954,True
+VBS00911-4651STDY7016955,True
+VBS00914-4651STDY7016956,True
+VBS00915-4651STDY7016957,True
+VBS00921-4651STDY7016958,True
+VBS00923-4651STDY7016959,True
+VBS00924-4651STDY7016960,True
+VBS00930-4651STDY7016961,True
+VBS00934-4651STDY7016962,True
+VBS00998-4651STDY7016963,True
+VBS01017-4651STDY7016964,True
+VBS01043-4651STDY7016965,True
+VBS01097-4651STDY7016966,True
+VBS01119-4651STDY7016967,True
+VBS01193-4651STDY7016968,True
+VBS01198-4651STDY7016969,True
+VBS01410-4651STDY7016970,True
+VBS01417-4651STDY7016971,True
+VBS01437-4651STDY7016972,True
+VBS01453-4651STDY7016973,True
+VBS01497-4651STDY7016974,True
+VBS01500-4651STDY7016975,True
+VBS01515-4651STDY7016976,True
+VBS01529-4651STDY7016977,True
+VBS01531-4651STDY7016978,True
+VBS01539-4651STDY7016979,True
+VBS01545-4651STDY7016980,True
+VBS01550-4651STDY7016981,True
+VBS01555-4651STDY7016982,True
+VBS01559-4651STDY7016983,True
+VBS01560-4651STDY7016984,True
+VBS01561-4651STDY7016985,True
+VBS01563-4651STDY7016986,True
+VBS01567-4651STDY7016987,True
+VBS01573-4651STDY7016990,True
+VBS01577-4651STDY7016991,True
+VBS01581-4651STDY7016992,True
+VBS01582-4651STDY7016993,True
+VBS01585-4651STDY7016994,True
+VBS01588-4651STDY7016995,True
+VBS01589-4651STDY7016996,True
+VBS01598-4651STDY7016997,True
+VBS01599-4651STDY7016998,True
+VBS01600-4651STDY7016999,True
+VBS01601-4651STDY7017000,True
+VBS01602-4651STDY7017001,True
+VBS01604-4651STDY7017002,True
+VBS01605-4651STDY7017003,True
+VBS01607-4651STDY7017004,True
+VBS01609-4651STDY7017005,True
+VBS01613-4651STDY7017006,True
+VBS01614-4651STDY7017007,True
+VBS01616-4651STDY7017008,True
+VBS01617-4651STDY7017009,True
+VBS01618-4651STDY7017010,True
+VBS01621-4651STDY7017011,True
+VBS01622-4651STDY7017012,True
+VBS01623-4651STDY7017013,True
+VBS01625-4651STDY7017014,True
+VBS01629-4651STDY7017015,True
+VBS01630-4651STDY7017016,True
+VBS01632-4651STDY7017017,True
+VBS01633-4651STDY7017018,True
+VBS01634-4651STDY7017019,True
+VBS01641-4651STDY7017020,True
+VBS01644-4651STDY7017021,True
+VBS01645-4651STDY7017022,True
+VBS01648-4651STDY7017023,True
+VBS01649-4651STDY7017024,True
+VBS01652-4651STDY7017025,True
+VBS01653-4651STDY7017026,True
+VBS01654-4651STDY7017027,True
+VBS01655-4651STDY7017028,True
+VBS01657-4651STDY7017029,True
+VBS01661-4651STDY7017030,True
+VBS01662-4651STDY7017031,True
+VBS01663-4651STDY7017032,True
+VBS01665-4651STDY7017033,True
+VBS01666-4651STDY7017034,True
+VBS01667-4651STDY7017035,True
+VBS01668-4651STDY7017036,True
+VBS01671-4651STDY7017037,True
+VBS01675-4651STDY7017038,True
+VBS01676-4651STDY7017039,True
+VBS01677-4651STDY7017040,True
+VBS01683-4651STDY7017041,True
+VBS01684-4651STDY7017042,True
+VBS01685-4651STDY7017043,True
+VBS01686-4651STDY7017044,True
+VBS01687-4651STDY7017045,True
+VBS01691-4651STDY7017046,True
+VBS01692-4651STDY7017047,True
+VBS01694-4651STDY7017048,True
+VBS01699-4651STDY7017049,True
+VBS01700-4651STDY7017050,True
+VBS01703-4651STDY7017051,True
+VBS01706-4651STDY7017052,True
+VBS01707-4651STDY7017053,True
+VBS01711-4651STDY7017054,True
+VBS01716-4651STDY7017055,True
+VBS01717-4651STDY7017056,True
+VBS01718-4651STDY7017057,True
+VBS01719-4651STDY7017058,True
+VBS01720-4651STDY7017059,True
+VBS01725-4651STDY7017060,True
+VBS01731-4651STDY7017061,True
+VBS01732-4651STDY7017062,True
+VBS01733-4651STDY7017063,True
+VBS01738-4651STDY7017064,True
+VBS01739-4651STDY7017065,True
+VBS01741-4651STDY7017066,True
+VBS01748-4651STDY7017067,True
+VBS01765-4651STDY7017068,True
+VBS01800-4651STDY7017069,True
+VBS01837-4651STDY7017070,True
+VBS01845-4651STDY7017071,True
+VBS01854-4651STDY7017072,True
+VBS01855-4651STDY7017073,True
+VBS01856-4651STDY7017074,True
+VBS01857-4651STDY7017075,True
+VBS01859-4651STDY7017076,True
+VBS01863-4651STDY7017077,True
+VBS01867-4651STDY7017078,True
+VBS01872-4651STDY7017079,True
+VBS01875-4651STDY7017080,True
+VBS01881-4651STDY7017081,True
+VBS01883-4651STDY7017082,True
+VBS01891-4651STDY7017083,True
+VBS01894-4651STDY7017086,True
+VBS01895-4651STDY7017087,True
+VBS01923-4651STDY7017088,True
+VBS01936-4651STDY7017089,True
+VBS01969-4651STDY7017090,True
+VBS01977-4651STDY7017091,True
+VBS01993-4651STDY7017092,True
diff --git a/tests/anoph/fixture/vo_agam_release_master_us_central1/v3/metadata/general/AG1000G-AO/surveillance.flags.csv b/tests/anoph/fixture/vo_agam_release_master_us_central1/v3/metadata/general/AG1000G-AO/surveillance.flags.csv
new file mode 100644
index 000000000..6ea981f6e
--- /dev/null
+++ b/tests/anoph/fixture/vo_agam_release_master_us_central1/v3/metadata/general/AG1000G-AO/surveillance.flags.csv
@@ -0,0 +1,135 @@
+sample_id,is_surveillance
+AR0001-C,True
+AR0002-C,True
+AR0003-C,True
+AR0004-C,True
+AR0004-Cx,True
+AR0005-C,True
+AR0006-C,True
+AR0007-C,True
+AR0008-C,True
+AR0008-Cx,True
+AR0009-C,True
+AR0010-C,True
+AR0010-Cx,True
+AR0011-C,True
+AR0012-C,True
+AR0013-C,True
+AR0013-Cx,True
+AR0014-C,True
+AR0014-Cx,True
+AR0015-C,True
+AR0016-C,True
+AR0016-Cx,True
+AR0017-C,True
+AR0018-C,True
+AR0018-Cx,True
+AR0019-C,True
+AR0020-C,True
+AR0020-Cx,True
+AR0021-C,True
+AR0021-Cx,True
+AR0022-C,True
+AR0023-C,True
+AR0023-Cx,True
+AR0024-C,True
+AR0024-Cx,True
+AR0025-C,True
+AR0026-C,True
+AR0027-C,True
+AR0027-Cx,True
+AR0028-C,True
+AR0029-C,True
+AR0030-C,True
+AR0031-C,True
+AR0032-C,True
+AR0033-C,True
+AR0034-C,True
+AR0035-C,True
+AR0036-C,True
+AR0037-C,True
+AR0038-C,True
+AR0038-Cx,True
+AR0039-C,True
+AR0040-C,True
+AR0041-C,True
+AR0042-C,True
+AR0043-C,True
+AR0044-C,True
+AR0045-C,True
+AR0046-C,True
+AR0047-C,True
+AR0048-C,True
+AR0049-C,True
+AR0050-C,True
+AR0050-Cx,True
+AR0051-C,True
+AR0052-C,True
+AR0053-C,True
+AR0054-C,True
+AR0054-Cx,True
+AR0055-C,True
+AR0056-C,True
+AR0057-C,True
+AR0058-C,True
+AR0058-Cx,True
+AR0059-C,True
+AR0060-C,True
+AR0061-C,True
+AR0062-C,True
+AR0063-C,True
+AR0064-C,True
+AR0065-C,True
+AR0066-C,True
+AR0066-Cx,True
+AR0067-C,True
+AR0067-Cx,True
+AR0068-C,True
+AR0068-Cx,True
+AR0069-C,True
+AR0069-Cx,True
+AR0070-C,True
+AR0070-Cx,True
+AR0071-C,True
+AR0071-Cx,True
+AR0072-C,True
+AR0073-C,True
+AR0073-Cx,True
+AR0074-C,True
+AR0075-C,True
+AR0075-Cx,True
+AR0076-C,True
+AR0077-C,True
+AR0078-C,True
+AR0079-C,True
+AR0079-Cx,True
+AR0080-C,True
+AR0081-C,True
+AR0081-Cx,True
+AR0082-C,True
+AR0082-Cx,True
+AR0083-C,True
+AR0083-Cx,True
+AR0084-C,True
+AR0085-C,True
+AR0085-Cx,True
+AR0086-C,True
+AR0087-C,True
+AR0088-C,True
+AR0088-Cx,True
+AR0089-C,True
+AR0089-Cx,True
+AR0090-C,True
+AR0090-Cx,True
+AR0091-C,True
+AR0092-C,True
+AR0093-C,True
+AR0093-Cx,True
+AR0094-C,True
+AR0095-C,True
+AR0096-C,True
+AR0096-Cx,True
+AR0097-C,True
+AR0098-C,True
+AR0099-C,True
+AR0100-C,True
diff --git a/tests/anoph/fixture/vo_agam_release_master_us_central1/v3/metadata/general/AG1000G-BF-A/surveillance.flags.csv b/tests/anoph/fixture/vo_agam_release_master_us_central1/v3/metadata/general/AG1000G-BF-A/surveillance.flags.csv
new file mode 100644
index 000000000..8df83bc7b
--- /dev/null
+++ b/tests/anoph/fixture/vo_agam_release_master_us_central1/v3/metadata/general/AG1000G-BF-A/surveillance.flags.csv
@@ -0,0 +1,334 @@
+sample_id,is_surveillance
+AB0085-C,True
+AB0085-Cx,True
+AB0086-C,True
+AB0086-Cx,True
+AB0087-C,True
+AB0087-Cx,True
+AB0088-C,True
+AB0089-C,True
+AB0089-Cx,True
+AB0090-C,True
+AB0090-Cx,True
+AB0091-C,True
+AB0091-Cx,True
+AB0092-C,True
+AB0092-Cx,True
+AB0093-C,True
+AB0094-C,True
+AB0094-Cx,True
+AB0095-C,True
+AB0095-Cx,True
+AB0096-C,True
+AB0097-C,True
+AB0097-Cx,True
+AB0098-C,True
+AB0098-Cx,True
+AB0099-C,True
+AB0099-Cx,True
+AB0100-C,True
+AB0100-Cx,True
+AB0101-C,True
+AB0102-C,True
+AB0103-C,True
+AB0104-C,True
+AB0104-Cx,True
+AB0105-C,True
+AB0105-Cx,True
+AB0106-C,True
+AB0106-Cx,True
+AB0107-C,True
+AB0107-Cx,True
+AB0108-C,True
+AB0108-Cx,True
+AB0109-C,True
+AB0110-C,True
+AB0110-Cx,True
+AB0111-C,True
+AB0111-Cx,True
+AB0112-C,True
+AB0112-Cx,True
+AB0113-C,True
+AB0114-C,True
+AB0114-Cx,True
+AB0115-C,True
+AB0116-C,True
+AB0116-Cx,True
+AB0117-C,True
+AB0117-Cx,True
+AB0118-C,True
+AB0118-Cx,True
+AB0119-C,True
+AB0119-Cx,True
+AB0120-C,True
+AB0121-C,True
+AB0121-Cx,True
+AB0122-C,True
+AB0122-Cx,True
+AB0123-C,True
+AB0123-Cx,True
+AB0124-C,True
+AB0124-Cx,True
+AB0125-C,True
+AB0126-C,True
+AB0126-Cx,True
+AB0127-C,True
+AB0127-Cx,True
+AB0128-C,True
+AB0128-Cx,True
+AB0129-C,True
+AB0129-Cx,True
+AB0130-C,True
+AB0130-Cx,True
+AB0131-C,True
+AB0131-Cx,True
+AB0132-C,True
+AB0132-CW,True
+AB0133-C,True
+AB0133-Cx,True
+AB0134-C,True
+AB0134-Cx,True
+AB0135-C,True
+AB0135-Cx,True
+AB0136-C,True
+AB0136-Cx,True
+AB0137-C,True
+AB0137-Cx,True
+AB0138-C,True
+AB0138-Cx,True
+AB0139-C,True
+AB0139-Cx,True
+AB0140-C,True
+AB0141-C,True
+AB0141-Cx,True
+AB0142-C,True
+AB0142-Cx,True
+AB0143-C,True
+AB0143-Cx,True
+AB0144-C,True
+AB0144-Cx,True
+AB0145-C,True
+AB0145-Cx,True
+AB0146-C,True
+AB0146-Cx,True
+AB0147-C,True
+AB0148-C,True
+AB0148-Cx,True
+AB0149-C,True
+AB0150-C,True
+AB0150-Cx,True
+AB0151-C,True
+AB0151-Cx,True
+AB0152-C,True
+AB0153-C,True
+AB0154-C,True
+AB0154-Cx,True
+AB0155-C,True
+AB0155-Cx,True
+AB0156-C,True
+AB0156-Cx,True
+AB0157-C,True
+AB0157-Cx,True
+AB0158-C,True
+AB0158-Cx,True
+AB0159-C,True
+AB0159-Cx,True
+AB0160-C,True
+AB0160-Cx,True
+AB0161-C,True
+AB0162-C,True
+AB0163-C,True
+AB0164-C,True
+AB0164-Cx,True
+AB0165-C,True
+AB0166-C,True
+AB0166-Cx,True
+AB0167-C,True
+AB0168-C,True
+AB0169-C,True
+AB0169-Cx,True
+AB0170-C,True
+AB0170-Cx,True
+AB0171-C,True
+AB0171-Cx,True
+AB0172-C,True
+AB0172-Cx,True
+AB0173-C,True
+AB0173-Cx,True
+AB0174-C,True
+AB0174-Cx,True
+AB0175-C,True
+AB0175-Cx,True
+AB0176-C,True
+AB0176-Cx,True
+AB0177-C,True
+AB0177-Cx,True
+AB0178-C,True
+AB0178-Cx,True
+AB0179-C,True
+AB0179-Cx,True
+AB0180-C,True
+AB0180-Cx,True
+AB0181-C,True
+AB0182-C,True
+AB0182-Cx,True
+AB0183-C,True
+AB0184-C,True
+AB0185-C,True
+AB0185-Cx,True
+AB0186-C,True
+AB0187-C,True
+AB0187-Cx,True
+AB0188-C,True
+AB0188-Cx,True
+AB0189-C,True
+AB0190-C,True
+AB0191-C,True
+AB0191-Cx,True
+AB0192-C,True
+AB0192-Cx,True
+AB0193-C,True
+AB0193-Cx,True
+AB0194-C,True
+AB0194-CW,True
+AB0195-C,True
+AB0195-Cx,True
+AB0196-C,True
+AB0196-Cx,True
+AB0197-C,True
+AB0197-Cx,True
+AB0198-C,True
+AB0199-C,True
+AB0199-Cx,True
+AB0200-C,True
+AB0200-Cx,True
+AB0201-C,True
+AB0201-Cx,True
+AB0202-C,True
+AB0202-Cx,True
+AB0203-C,True
+AB0203-Cx,True
+AB0204-C,True
+AB0205-C,True
+AB0206-C,True
+AB0206-CW,True
+AB0207-C,True
+AB0208-C,True
+AB0209-C,True
+AB0209-Cx,True
+AB0210-C,True
+AB0210-CW,True
+AB0211-C,True
+AB0212-C,True
+AB0212-Cx,True
+AB0213-C,True
+AB0213-CW,True
+AB0214-C,True
+AB0214-CW,True
+AB0215-C,True
+AB0216-C,True
+AB0216-CW,True
+AB0217-C,True
+AB0217-CW,True
+AB0218-C,True
+AB0219-C,True
+AB0219-Cx,True
+AB0220-C,True
+AB0220-CW,True
+AB0221-C,True
+AB0222-C,True
+AB0222-Cx,True
+AB0223-C,True
+AB0223-Cx,True
+AB0224-C,True
+AB0225-C,True
+AB0226-C,True
+AB0227-C,True
+AB0227-Cx,True
+AB0228-C,True
+AB0228-Cx,True
+AB0229-C,True
+AB0229-Cx,True
+AB0230-C,True
+AB0230-Cx,True
+AB0231-C,True
+AB0232-C,True
+AB0232-Cx,True
+AB0233-C,True
+AB0233-Cx,True
+AB0234-C,True
+AB0235-C,True
+AB0235-Cx,True
+AB0236-C,True
+AB0236-Cx,True
+AB0237-C,True
+AB0238-C,True
+AB0239-C,True
+AB0239-Cx,True
+AB0240-C,True
+AB0240-Cx,True
+AB0241-C,True
+AB0241-Cx,True
+AB0242-C,True
+AB0243-C,True
+AB0244-C,True
+AB0245-C,True
+AB0246-C,True
+AB0247-C,True
+AB0248-C,True
+AB0249-C,True
+AB0249-Cx,True
+AB0250-C,True
+AB0250-Cx,True
+AB0251-C,True
+AB0251-Cx,True
+AB0252-C,True
+AB0253-C,True
+AB0254-C,True
+AB0255-C,True
+AB0256-C,True
+AB0256-Cx,True
+AB0257-C,True
+AB0257-Cx,True
+AB0258-C,True
+AB0258-Cx,True
+AB0259-C,True
+AB0259-Cx,True
+AB0260-C,True
+AB0261-C,True
+AB0261-Cx,True
+AB0262-C,True
+AB0262-Cx,True
+AB0263-C,True
+AB0263-Cx,True
+AB0264-C,True
+AB0264-Cx,True
+AB0265-C,True
+AB0266-C,True
+AB0266-Cx,True
+AB0267-C,True
+AB0268-C,True
+AB0269-C,True
+AB0269-Cx,True
+AB0270-C,True
+AB0271-C,True
+AB0271-Cx,True
+AB0272-C,True
+AB0272-Cx,True
+AB0273-C,True
+AB0273-Cx,True
+AB0274-C,True
+AB0275-C,True
+AB0276-C,True
+AB0277-C,True
+AB0278-C,True
+AB0279-C,True
+AB0279-Cx,True
+AB0280-C,True
+AB0280-Cx,True
+AB0281-C,True
+AB0281-Cx,True
+AB0282-C,True
+AB0282-Cx,True
+AB0283-C,True
+AB0284-C,True
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
index 080b32816..6feae55ab 100644
--- a/tests/anoph/test_sample_metadata.py
+++ b/tests/anoph/test_sample_metadata.py
@@ -647,13 +647,19 @@ def sample_metadata_expected_columns(
has_aims, has_cohorts_by_quarter, has_sequence_qc, ordered_contigs
):
expected_columns = general_metadata_expected_columns()
+
if has_sequence_qc:
expected_columns.update(sequence_qc_metadata_expected_columns(ordered_contigs))
+
+ expected_columns.update({"is_surveillance": "b"})
+
if has_aims:
expected_columns.update(aim_metadata_expected_columns())
+
expected_columns.update(
cohorts_metadata_expected_columns(has_cohorts_by_quarter=has_cohorts_by_quarter)
)
+
return expected_columns
@@ -829,8 +835,9 @@ def test_sample_metadata_quarter(fixture, api: AnophelesSampleMetadata):
def test_sample_metadata_with_missing_file(
missing_metadata_api: AnophelesSampleMetadata,
):
- # In this test, one of the sample sets (AG1000G-BF-A) has a missing file.
+ # In this test, there is missing metadata.
# We expect this to be filled with empty values.
+ # We also expect warnings for missing surveillance flags.
api = missing_metadata_api
# Set up test.
@@ -840,7 +847,15 @@ def test_sample_metadata_with_missing_file(
for sample_set in all_sample_sets:
# Call function to be tested.
- df = api.sample_metadata(sample_sets=sample_set)
+ with pytest.warns(UserWarning) as captured_warnings:
+ df = api.sample_metadata(sample_sets=sample_set)
+
+ # Check expected warnings.
+ expected_message = f"WARNING: The surveillance flags data is missing for sample set {sample_set}"
+ assert all(
+ str(captured_warning.message) == expected_message
+ for captured_warning in captured_warnings
+ )
# Check output.
validate_metadata(
From 19902b01a8f5171754c3a1c999e2ec61ff501d45 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 20 Mar 2025 16:00:59 +0000
Subject: [PATCH 05/32] WIP: add _prep_sample_query_param() stub where
 _prep_sample_sets_param() is used
---
malariagen_data/anoph/aim_data.py | 10 +++++++---
malariagen_data/anoph/base.py | 10 ++++++++++
malariagen_data/anoph/cnv_data.py | 2 ++
malariagen_data/anoph/dipclust.py | 6 +++++-
malariagen_data/anoph/fst.py | 4 ++--
malariagen_data/anoph/g123.py | 4 ++--
malariagen_data/anoph/h12.py | 4 ++--
malariagen_data/anoph/h1x.py | 4 ++--
malariagen_data/anoph/hap_data.py | 8 +++++---
malariagen_data/anoph/hapclust.py | 6 +++++-
malariagen_data/anoph/sample_metadata.py | 21 ++++++++++++---------
malariagen_data/anoph/snp_data.py | 14 ++++++++++----
malariagen_data/anopheles.py | 6 +++---
13 files changed, 67 insertions(+), 32 deletions(-)
diff --git a/malariagen_data/anoph/aim_data.py b/malariagen_data/anoph/aim_data.py
index ba98aa125..0e89a93b8 100644
--- a/malariagen_data/anoph/aim_data.py
+++ b/malariagen_data/anoph/aim_data.py
@@ -142,6 +142,8 @@ def aim_calls(
aims = self._prep_aims_param(aims=aims)
sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
del sample_sets
+ sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
+ del sample_query
# Access SNP calls and concatenate multiple sample sets and/or regions.
ly = []
@@ -156,12 +158,14 @@ def aim_calls(
ds = simple_xarray_concat(ly, dim=DIM_SAMPLE)
# Handle sample query.
- if sample_query is not None:
+ if sample_query_prepped is not None:
df_samples = self.sample_metadata(sample_sets=sample_sets_prepped)
sample_query_options = sample_query_options or {}
- loc_samples = df_samples.eval(sample_query, **sample_query_options).values
+ loc_samples = df_samples.eval(
+ sample_query_prepped, **sample_query_options
+ ).values
if np.count_nonzero(loc_samples) == 0:
- raise ValueError(f"No samples found for query {sample_query!r}")
+ raise ValueError(f"No samples found for query {sample_query_prepped!r}")
ds = ds.isel(samples=loc_samples)
return ds
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index a68f70e5a..f38df799b 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -574,6 +574,16 @@ def _prep_sample_sets_param(
return prepped_sample_sets
+ def _prep_sample_query_param(
+ self, *, sample_query: Optional[base_params.sample_query]
+ ) -> str:
+ """Common handling for the `sample_query` parameter."""
+
+ # FIXME: WIP
+ prepped_sample_query = sample_query
+
+ return prepped_sample_query
+
def _results_cache_add_analysis_params(self, params: dict):
# Expect sub-classes will override to add any analysis parameters.
pass
diff --git a/malariagen_data/anoph/cnv_data.py b/malariagen_data/anoph/cnv_data.py
index 23eeef33b..66eefa965 100644
--- a/malariagen_data/anoph/cnv_data.py
+++ b/malariagen_data/anoph/cnv_data.py
@@ -198,6 +198,7 @@ def cnv_hmm(
debug("normalise parameters")
sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ sample_query = self._prep_sample_query_param(sample_query=sample_query)
regions: List[Region] = parse_multi_region(self, region)
del region
@@ -589,6 +590,7 @@ def cnv_discordant_read_calls(
debug("normalise parameters")
sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ sample_query = self._prep_sample_query_param(sample_query=sample_query)
if isinstance(contig, str):
contig = [contig]
diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py
index 9f10debda..aee330a78 100644
--- a/malariagen_data/anoph/dipclust.py
+++ b/malariagen_data/anoph/dipclust.py
@@ -216,12 +216,16 @@ def diplotype_pairwise_distances(
# Normalize params for consistent hash value.
sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
+ del sample_sets
+ sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
+ del sample_query
region_prepped = self._prep_region_cache_param(region=region)
+ del region
params = dict(
region=region_prepped,
site_mask=site_mask,
sample_sets=sample_sets_prepped,
- sample_query=sample_query,
+ sample_query=sample_query_prepped,
sample_query_options=sample_query_options,
site_class=site_class,
cohort_size=cohort_size,
diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py
index 086d81824..b4175fe19 100644
--- a/malariagen_data/anoph/fst.py
+++ b/malariagen_data/anoph/fst.py
@@ -131,8 +131,8 @@ def fst_gwss(
params = dict(
contig=contig,
window_size=window_size,
- cohort1_query=cohort1_query,
- cohort2_query=cohort2_query,
+ cohort1_query=self._prep_sample_query_param(sample_query=cohort1_query),
+ cohort2_query=self._prep_sample_query_param(sample_query=cohort2_query),
sample_query_options=sample_query_options,
sample_sets=self._prep_sample_sets_param(sample_sets=sample_sets),
site_mask=self._prep_optional_site_mask_param(site_mask=site_mask),
diff --git a/malariagen_data/anoph/g123.py b/malariagen_data/anoph/g123.py
index dca996aa0..24796e0ba 100644
--- a/malariagen_data/anoph/g123.py
+++ b/malariagen_data/anoph/g123.py
@@ -190,7 +190,7 @@ def g123_gwss(
# N.B., do not be tempted to convert this sample query into integer
# indices using _prep_sample_selection_params, because the indices
# are different in the haplotype data.
- sample_query=sample_query,
+ sample_query=self._prep_sample_query_param(sample_query=sample_query),
sample_query_options=sample_query_options,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
@@ -288,7 +288,7 @@ def g123_calibration(
# N.B., do not be tempted to convert this sample query into integer
# indices using _prep_sample_selection_params, because the indices
# are different in the haplotype data.
- sample_query=sample_query,
+ sample_query=self._prep_sample_query_param(sample_query=sample_query),
sample_query_options=sample_query_options,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
diff --git a/malariagen_data/anoph/h12.py b/malariagen_data/anoph/h12.py
index 3e211eeb3..f39746614 100644
--- a/malariagen_data/anoph/h12.py
+++ b/malariagen_data/anoph/h12.py
@@ -102,7 +102,7 @@ def h12_calibration(
# N.B., do not be tempted to convert this sample query into integer
# indices using _prep_sample_selection_params, because the indices
# are different in the haplotype data.
- sample_query=sample_query,
+ sample_query=self._prep_sample_query_param(sample_query=sample_query),
sample_query_options=sample_query_options,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
@@ -318,7 +318,7 @@ def h12_gwss(
# N.B., do not be tempted to convert this sample query into integer
# indices using _prep_sample_selection_params, because the indices
# are different in the haplotype data.
- sample_query=sample_query,
+ sample_query=self._prep_sample_query_param(sample_query=sample_query),
sample_query_options=sample_query_options,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
diff --git a/malariagen_data/anoph/h1x.py b/malariagen_data/anoph/h1x.py
index 79e5b1ab7..eb335bcf8 100644
--- a/malariagen_data/anoph/h1x.py
+++ b/malariagen_data/anoph/h1x.py
@@ -139,8 +139,8 @@ def h1x_gwss(
# N.B., do not be tempted to convert these sample queries into integer
# indices using _prep_sample_selection_params, because the indices
# are different in the haplotype data.
- cohort1_query=cohort1_query,
- cohort2_query=cohort2_query,
+ cohort1_query=self._prep_sample_query_param(sample_query=cohort1_query),
+ cohort2_query=self._prep_sample_query_param(sample_query=cohort2_query),
sample_query_options=sample_query_options,
sample_sets=self._prep_sample_sets_param(sample_sets=sample_sets),
cohort_size=cohort_size,
diff --git a/malariagen_data/anoph/hap_data.py b/malariagen_data/anoph/hap_data.py
index 10f637720..0e8470010 100644
--- a/malariagen_data/anoph/hap_data.py
+++ b/malariagen_data/anoph/hap_data.py
@@ -350,6 +350,8 @@ def haplotypes(
# Normalise parameters.
sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
del sample_sets
+ sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
+ del sample_query
regions: List[Region] = parse_multi_region(self, region)
del region
analysis = self._prep_phasing_analysis_param(analysis=analysis)
@@ -392,7 +394,7 @@ def haplotypes(
ds = simple_xarray_concat(lx, dim=DIM_VARIANT)
# Handle sample query.
- if sample_query is not None:
+ if sample_query_prepped is not None:
# Load sample metadata.
df_samples = self.sample_metadata(sample_sets=sample_sets_prepped)
@@ -405,12 +407,12 @@ def haplotypes(
# Apply the query.
sample_query_options = sample_query_options or {}
loc_samples = df_samples_phased.eval(
- sample_query, **sample_query_options
+ sample_query_prepped, **sample_query_options
).values
if np.count_nonzero(loc_samples) == 0:
# Bail out, no samples matching the query.
raise ValueError(
- f"No samples found for phasing analysis {analysis!r} and query {sample_query!r}"
+ f"No samples found for phasing analysis {analysis!r} and query {sample_query_prepped!r}"
)
ds = ds.isel(samples=loc_samples)
diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py
index 05eb6acb8..2c4efa5f3 100644
--- a/malariagen_data/anoph/hapclust.py
+++ b/malariagen_data/anoph/hapclust.py
@@ -212,12 +212,16 @@ def haplotype_pairwise_distances(
# Normalize params for consistent hash value.
sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
+ del sample_sets
+ sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
+ del sample_query
region_prepped = self._prep_region_cache_param(region=region)
+ del region
params = dict(
region=region_prepped,
analysis=analysis,
sample_sets=sample_sets_prepped,
- sample_query=sample_query,
+ sample_query=sample_query_prepped,
sample_query_options=sample_query_options,
cohort_size=cohort_size,
random_seed=random_seed,
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 804ddc9e0..7b3329904 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -654,9 +654,11 @@ def sample_metadata(
)
# Normalise parameters.
- prepped_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
del sample_sets
- cache_key = tuple(prepped_sample_sets)
+ sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
+ del sample_query
+ cache_key = tuple(sample_sets_prepped)
try:
# Attempt to retrieve from the cache.
@@ -668,12 +670,12 @@ def sample_metadata(
# Get the general sample metadata.
# Note: this includes study and terms-of-use info.
- df_samples = self.general_metadata(sample_sets=prepped_sample_sets)
+ df_samples = self.general_metadata(sample_sets=sample_sets_prepped)
# Merge with the sequence QC metadata.
# Note: merging can change column dtypes, e.g. due to new NaNs.
df_sequence_qc = self.sequence_qc_metadata(
- sample_sets=prepped_sample_sets
+ sample_sets=sample_sets_prepped
)
df_samples = df_samples.merge(
df_sequence_qc, on="sample_id", sort=False, how="left"
@@ -682,7 +684,7 @@ def sample_metadata(
# Merge with the surveillance flags.
# Note: merging can change column dtypes, e.g. due to new NaNs.
df_surveillance_flags = self.surveillance_flags(
- sample_sets=prepped_sample_sets
+ sample_sets=sample_sets_prepped
)
df_samples = df_samples.merge(
df_surveillance_flags, on="sample_id", sort=False, how="left"
@@ -690,14 +692,14 @@ def sample_metadata(
# If available, merge with the AIM metadata.
if self._aim_analysis:
- df_aim = self.aim_metadata(sample_sets=prepped_sample_sets)
+ df_aim = self.aim_metadata(sample_sets=sample_sets_prepped)
df_samples = df_samples.merge(
df_aim, on="sample_id", sort=False, how="left"
)
# If available, merge with the cohorts metadata.
if self._cohorts_analysis:
- df_cohorts = self.cohorts_metadata(sample_sets=prepped_sample_sets)
+ df_cohorts = self.cohorts_metadata(sample_sets=sample_sets_prepped)
df_samples = df_samples.merge(
df_cohorts, on="sample_id", sort=False, how="left"
)
@@ -714,10 +716,10 @@ def sample_metadata(
df_samples = df_samples.merge(data, how="left", on=on)
# Apply the sample_query or sample_indices, if specified.
- if sample_query is not None:
+ if sample_query_prepped is not None:
# Assume a pandas query string.
sample_query_options = sample_query_options or {}
- df_samples = df_samples.query(sample_query, **sample_query_options)
+ df_samples = df_samples.query(sample_query_prepped, **sample_query_options)
df_samples = df_samples.reset_index(drop=True)
elif sample_indices is not None:
# Assume it is an indexer.
@@ -977,6 +979,7 @@ def _prep_sample_selection_cache_params(
) -> Tuple[List[str], Optional[List[int]]]:
# Normalise sample sets.
sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ sample_query = self._prep_sample_query_param(sample_query=sample_query)
if sample_query is not None:
# Resolve query to a list of integers for more cache hits - we
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
index a18c65da1..7b01ee7b4 100644
--- a/malariagen_data/anoph/snp_data.py
+++ b/malariagen_data/anoph/snp_data.py
@@ -464,6 +464,8 @@ def snp_genotypes(
# Normalise parameters.
sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
del sample_sets
+ sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
+ del sample_query
regions: List[Region] = parse_multi_region(self, region)
del region
site_mask_prepped = self._prep_optional_site_mask_param(site_mask=site_mask)
@@ -514,12 +516,14 @@ def snp_genotypes(
d = da_compress(loc_sites, d, axis=0)
# Apply sample selection if requested.
- if sample_query is not None:
+ if sample_query_prepped is not None:
df_samples = self.sample_metadata(sample_sets=sample_sets_prepped)
sample_query_options = sample_query_options or {}
- loc_samples = df_samples.eval(sample_query, **sample_query_options).values
+ loc_samples = df_samples.eval(
+ sample_query_prepped, **sample_query_options
+ ).values
if np.count_nonzero(loc_samples) == 0:
- raise ValueError(f"No samples found for query {sample_query!r}")
+ raise ValueError(f"No samples found for query {sample_query_prepped!r}")
d = da.compress(loc_samples, d, axis=1)
elif sample_indices is not None:
d = da.take(d, sample_indices, axis=1)
@@ -1007,6 +1011,8 @@ def snp_calls(
self._prep_sample_sets_param(sample_sets=sample_sets)
)
del sample_sets
+ sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
+ del sample_query
if sample_indices is not None:
sample_indices_prepped: Optional[Tuple[int, ...]] = tuple(sample_indices)
else:
@@ -1020,7 +1026,7 @@ def snp_calls(
return self._snp_calls(
regions=regions,
sample_sets=sample_sets_prepped,
- sample_query=sample_query,
+ sample_query=sample_query_prepped,
sample_query_options=sample_query_options,
sample_indices=sample_indices_prepped,
site_mask=site_mask_prepped,
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 970d178a3..800df5b2f 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -1330,7 +1330,7 @@ def ihs_gwss(
# N.B., do not be tempted to convert this sample query into integer
# indices using _prep_sample_selection_params, because the indices
# are different in the haplotype data.
- sample_query=sample_query,
+ sample_query=self._prep_sample_query_param(sample_query=sample_query),
sample_query_options=sample_query_options,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
@@ -1856,8 +1856,8 @@ def xpehh_gwss(
# N.B., do not be tempted to convert this sample query into integer
# indices using _prep_sample_selection_params, because the indices
# are different in the haplotype data.
- cohort1_query=cohort1_query,
- cohort2_query=cohort2_query,
+ cohort1_query=self._prep_sample_query_param(sample_query=cohort1_query),
+ cohort2_query=self._prep_sample_query_param(sample_query=cohort2_query),
sample_query_options=sample_query_options,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
From d7b83834f8db432dd12277394efd49554b6ffdb6 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 21 Mar 2025 10:38:55 +0000
Subject: [PATCH 06/32] Add logic to _prep_sample_query_param() to honour
self._surveillance_use_only
---
malariagen_data/anoph/base.py | 10 ++++++++--
malariagen_data/anoph/sample_metadata.py | 1 +
2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index f38df799b..fd9385100 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -579,8 +579,14 @@ def _prep_sample_query_param(
) -> str:
"""Common handling for the `sample_query` parameter."""
- # FIXME: WIP
- prepped_sample_query = sample_query
+ # If self._surveillance_use_only, then add "is_surveillance == True"
+ if self._surveillance_use_only:
+ if sample_query is None or sample_query.strip() == "":
+ prepped_sample_query = "is_surveillance == True"
+ else:
+ prepped_sample_query = f"{sample_query} and is_surveillance == True"
+ else:
+ prepped_sample_query = sample_query
return prepped_sample_query
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 7b3329904..d1138e6ef 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -705,6 +705,7 @@ def sample_metadata(
)
# If surveillance_use_only, restrict to samples with is_surveillance.
+ # Note: this will also be enforced via self._prep_sample_query_param().
if "is_surveillance" in df_samples.columns and self._surveillance_use_only:
df_samples = df_samples[df_samples["is_surveillance"].astype(bool)]
From 435e8a7b7c02bc4aa7fb4b0fdbc0985efffaaba3 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 21 Mar 2025 16:37:34 +0000
Subject: [PATCH 07/32] Allow _prep_sample_query_param() to return None
---
malariagen_data/anoph/base.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index fd9385100..628814498 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -576,7 +576,7 @@ def _prep_sample_sets_param(
def _prep_sample_query_param(
self, *, sample_query: Optional[base_params.sample_query]
- ) -> str:
+ ) -> Optional[str]:
"""Common handling for the `sample_query` parameter."""
# If self._surveillance_use_only, then add "is_surveillance == True"
From bde3d4e9c4c49feda08deae3ea6949be1cf5f680 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 21 Mar 2025 16:57:06 +0000
Subject: [PATCH 08/32] Return consistent data type from
_prep_sample_query_param()
---
malariagen_data/anoph/base.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index 628814498..84316f964 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -576,17 +576,18 @@ def _prep_sample_sets_param(
def _prep_sample_query_param(
self, *, sample_query: Optional[base_params.sample_query]
- ) -> Optional[str]:
+ ) -> Optional[base_params.sample_query]:
"""Common handling for the `sample_query` parameter."""
+ # Return the same data type and default to the original value.
+ prepped_sample_query: Optional[base_params.sample_query] = sample_query
+
# If self._surveillance_use_only, then add "is_surveillance == True"
if self._surveillance_use_only:
if sample_query is None or sample_query.strip() == "":
prepped_sample_query = "is_surveillance == True"
else:
prepped_sample_query = f"{sample_query} and is_surveillance == True"
- else:
- prepped_sample_query = sample_query
return prepped_sample_query
From 50b3f5ce0eda76abf260971fc4d33598b30a6e69 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 24 Apr 2025 12:30:28 +0100
Subject: [PATCH 09/32] Add new public_url param to sample_metadata tests
---
tests/anoph/test_sample_metadata.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
index 05ed7365a..7536a927e 100644
--- a/tests/anoph/test_sample_metadata.py
+++ b/tests/anoph/test_sample_metadata.py
@@ -41,6 +41,7 @@ def ag3_sim_api(ag3_sim_fixture):
def ag3_sim_unrestricted_use_only_api(ag3_sim_fixture):
return AnophelesSampleMetadata(
url=ag3_sim_fixture.url,
+ public_url=ag3_sim_fixture.url,
config_path=_ag3.CONFIG_PATH,
major_version_number=_ag3.MAJOR_VERSION_NUMBER,
major_version_path=_ag3.MAJOR_VERSION_PATH,
@@ -63,6 +64,7 @@ def ag3_sim_unrestricted_use_only_api(ag3_sim_fixture):
def ag3_sim_surveillance_use_only_api(ag3_sim_fixture):
return AnophelesSampleMetadata(
url=ag3_sim_fixture.url,
+ public_url=ag3_sim_fixture.url,
config_path=_ag3.CONFIG_PATH,
major_version_number=_ag3.MAJOR_VERSION_NUMBER,
major_version_path=_ag3.MAJOR_VERSION_PATH,
@@ -85,6 +87,7 @@ def ag3_sim_surveillance_use_only_api(ag3_sim_fixture):
def ag3_sim_unrestricted_surveillance_use_only_api(ag3_sim_fixture):
return AnophelesSampleMetadata(
url=ag3_sim_fixture.url,
+ public_url=ag3_sim_fixture.url,
config_path=_ag3.CONFIG_PATH,
major_version_number=_ag3.MAJOR_VERSION_NUMBER,
major_version_path=_ag3.MAJOR_VERSION_PATH,
@@ -121,6 +124,7 @@ def af1_sim_api(af1_sim_fixture):
def af1_sim_unrestricted_use_only_api(af1_sim_fixture):
return AnophelesSampleMetadata(
url=af1_sim_fixture.url,
+ public_url=af1_sim_fixture.url,
config_path=_af1.CONFIG_PATH,
major_version_number=_af1.MAJOR_VERSION_NUMBER,
major_version_path=_af1.MAJOR_VERSION_PATH,
@@ -134,6 +138,7 @@ def af1_sim_unrestricted_use_only_api(af1_sim_fixture):
def af1_sim_surveillance_use_only_api(af1_sim_fixture):
return AnophelesSampleMetadata(
url=af1_sim_fixture.url,
+ public_url=af1_sim_fixture.url,
config_path=_af1.CONFIG_PATH,
major_version_number=_af1.MAJOR_VERSION_NUMBER,
major_version_path=_af1.MAJOR_VERSION_PATH,
@@ -147,6 +152,7 @@ def af1_sim_surveillance_use_only_api(af1_sim_fixture):
def af1_sim_unrestricted_surveillance_use_only_api(af1_sim_fixture):
return AnophelesSampleMetadata(
url=af1_sim_fixture.url,
+ public_url=af1_sim_fixture.url,
config_path=_af1.CONFIG_PATH,
major_version_number=_af1.MAJOR_VERSION_NUMBER,
major_version_path=_af1.MAJOR_VERSION_PATH,
From fdebfd45111eb35814e04a59e2c7414299c8e075 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 23 May 2025 12:33:40 +0100
Subject: [PATCH 10/32] WIP: dev support for unrestricted_use_only,
surveillance_use_only params
---
malariagen_data/af1.py | 23 +-
malariagen_data/ag3.py | 55 ++-
malariagen_data/anoph/aim_data.py | 65 +++-
malariagen_data/anoph/base.py | 427 ++++++++++++++++++++---
malariagen_data/anoph/cnv_data.py | 56 ++-
malariagen_data/anoph/sample_metadata.py | 94 +++--
tests/anoph/test_sample_metadata.py | 10 +-
7 files changed, 594 insertions(+), 136 deletions(-)
diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py
index 79a566484..cf5b613c6 100644
--- a/malariagen_data/af1.py
+++ b/malariagen_data/af1.py
@@ -136,15 +136,16 @@ def __init__(
def __repr__(self):
text = (
f"\n"
- f"Storage URL : {self._url}\n"
- f"Data releases available : {', '.join(self.releases)}\n"
- f"Results cache : {self._results_cache}\n"
- f"Cohorts analysis : {self._cohorts_analysis}\n"
- f"Site filters analysis : {self._site_filters_analysis}\n"
- f"Software version : malariagen_data {malariagen_data.__version__}\n"
- f"Client location : {self.client_location}\n"
+ f"Storage URL : {self._url}\n"
+ f"Data releases available : {', '.join(self._available_releases)}\n"
+ f"Results cache : {self._results_cache}\n"
+ f"Cohorts analysis : {self._cohorts_analysis}\n"
+ f"Site filters analysis : {self._site_filters_analysis}\n"
+ f"Software version : malariagen_data {malariagen_data.__version__}\n"
+ f"Client location : {self.client_location}\n"
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
+ f"Relevant data releases : {', '.join(self.releases)}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
@@ -178,7 +179,7 @@ def _repr_html_(self):
Data releases available
|
- {', '.join(self.releases)} |
+ {', '.join(self._available_releases)} |
@@ -222,6 +223,12 @@ def _repr_html_(self):
|
{self._surveillance_use_only} |
+
+
+ Relevant data releases
+ |
+ {', '.join(self.releases)} |
+
"""
diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
index f1d7f88e2..ad9363c2d 100644
--- a/malariagen_data/ag3.py
+++ b/malariagen_data/ag3.py
@@ -210,23 +210,24 @@ def v3_wild(self):
3.0 release, excluding the lab crosses."""
return [
x
- for x in self.sample_sets(release="3.0")["sample_set"].tolist()
+ for x in self._available_sample_sets(release="3.0")["sample_set"].tolist()
if x != "AG1000G-X"
]
def __repr__(self):
text = (
f"\n"
- f"Storage URL : {self._url}\n"
- f"Data releases available : {', '.join(self.releases)}\n"
- f"Results cache : {self._results_cache}\n"
- f"Cohorts analysis : {self._cohorts_analysis}\n"
- f"AIM analysis : {self._aim_analysis}\n"
- f"Site filters analysis : {self._site_filters_analysis}\n"
- f"Software version : malariagen_data {malariagen_data.__version__}\n"
- f"Client location : {self.client_location}\n"
+ f"Storage URL : {self._url}\n"
+ f"Data releases available : {', '.join(self._available_releases)}\n"
+ f"Results cache : {self._results_cache}\n"
+ f"Cohorts analysis : {self._cohorts_analysis}\n"
+ f"AIM analysis : {self._aim_analysis}\n"
+ f"Site filters analysis : {self._site_filters_analysis}\n"
+ f"Software version : malariagen_data {malariagen_data.__version__}\n"
+ f"Client location : {self.client_location}\n"
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
+ f"Relevant data releases : {', '.join(self.releases)}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
@@ -260,7 +261,7 @@ def _repr_html_(self):
Data releases available
|
- {', '.join(self.releases)} |
+ {', '.join(self._available_releases)} |
@@ -310,6 +311,12 @@ def _repr_html_(self):
|
{self._surveillance_use_only} |
+
+
+ Relevant data releases
+ |
+ {', '.join(self.releases)} |
+
"""
@@ -357,6 +364,34 @@ def cross_metadata(self):
debug("drop 'phenotype' column, not used")
df.drop("phenotype", axis="columns", inplace=True)
+ # Identify the crosses sample set.
+ # Note: this sample set identifier is also hard-coded in `v3_wild()`.
+ crosses_sample_set = "AG1000G-X"
+
+ # If `_unrestricted_use_only` is `True`, then only return data if the crosses sample set has `unrestricted_use` set to `True`.
+ if (
+ self._unrestricted_use_only
+ and not self._sample_set_has_unrestricted_use(
+ sample_set=crosses_sample_set
+ )
+ ):
+ # Remove all the data from the DataFrame and reset its index.
+ df = df.iloc[0:0].reset_index(drop=True)
+
+ # If `_surveillance_use_only` is `True`, then only return samples that have `is_surveillance` set to `True`.
+ if self._surveillance_use_only:
+ crosses_surveillance_flags_df = self._surveillance_flags(
+ sample_sets=[crosses_sample_set]
+ )
+ df = df.merge(
+ crosses_surveillance_flags_df[["sample_id", "is_surveillance"]],
+ on="sample_id",
+ how="left",
+ )
+ df = df[df["is_surveillance"]]
+ df = df.drop(columns=["is_surveillance"])
+
+ # Cache the cross metadata.
self._cache_cross_metadata = df
return self._cache_cross_metadata.copy()
diff --git a/malariagen_data/anoph/aim_data.py b/malariagen_data/anoph/aim_data.py
index 0e89a93b8..46f671e88 100644
--- a/malariagen_data/anoph/aim_data.py
+++ b/malariagen_data/anoph/aim_data.py
@@ -138,35 +138,64 @@ def aim_calls(
) -> xr.Dataset:
self._require_aim_analysis()
- # Normalise parameters.
- aims = self._prep_aims_param(aims=aims)
- sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
+ # Prepare parameters.
+ prepared_aims = self._prep_aims_param(aims=aims)
+ del aims
+ prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
del sample_sets
- sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
+ prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
del sample_query
- # Access SNP calls and concatenate multiple sample sets and/or regions.
- ly = []
- for s in sample_sets_prepped:
- y = self._aim_calls_dataset(
- aims=aims,
- sample_set=s,
+ # Start a list of AIM calls Datasets, one for each sample set.
+ aim_calls_datasets = []
+
+ # For each sample set...
+ for sample_set in prepared_sample_sets:
+ # Get the AIM calls for all samples in the set, as an Xarray Dataset.
+ aim_calls_dataset = self._aim_calls_dataset(
+ aims=prepared_aims,
+ sample_set=sample_set,
)
- ly.append(y)
+
+ # Add this Dataset to the list.
+ aim_calls_datasets.append(aim_calls_dataset)
# Concatenate data from multiple sample sets.
- ds = simple_xarray_concat(ly, dim=DIM_SAMPLE)
+ ds = simple_xarray_concat(aim_calls_datasets, dim=DIM_SAMPLE)
- # Handle sample query.
- if sample_query_prepped is not None:
- df_samples = self.sample_metadata(sample_sets=sample_sets_prepped)
+ # If there's a sample query...
+ if prepared_sample_query is not None:
+ # Get the relevant sample metadata.
+ df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)
+
+ # If there are no sample query options, then default to an empty dict.
sample_query_options = sample_query_options or {}
+
+ # Determine which samples match the sample query.
loc_samples = df_samples.eval(
- sample_query_prepped, **sample_query_options
+ prepared_sample_query, **sample_query_options
).values
+
+ # Raise an error if no samples match the sample query.
if np.count_nonzero(loc_samples) == 0:
- raise ValueError(f"No samples found for query {sample_query_prepped!r}")
- ds = ds.isel(samples=loc_samples)
+ raise ValueError(
+ f"No samples found for query {prepared_sample_query!r}"
+ )
+
+ # Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
+ relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values
+
+ # Get all the sample ids from the unfiltered AIM calls Dataset.
+ ds_sample_ids = ds.coords["sample_id"].values
+
+ # Get the indices of samples in the AIM calls Dataset that match the relevant sample ids.
+ # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
+ relevant_sample_indices = np.where(
+ np.isin(ds_sample_ids, relevant_sample_ids)
+ )[0]
+
+ # Select only the relevant samples from the AIM calls Dataset.
+ ds = ds.isel(samples=relevant_sample_indices)
return ds
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index e00662da5..b17bc6603 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -172,7 +172,9 @@ def __init__(
# Set up cache attributes.
self._cache_releases: Optional[Tuple[str, ...]] = None
+ self._cache_available_releases: Optional[Tuple[str, ...]] = None
self._cache_sample_sets: Dict[str, pd.DataFrame] = dict()
+ self._cache_available_sample_sets: Dict[str, pd.DataFrame] = dict()
self._cache_sample_set_to_release: Optional[Dict[str, str]] = None
self._cache_sample_set_to_study: Optional[Dict[str, str]] = None
self._cache_sample_set_to_study_info: Optional[Dict[str, dict]] = None
@@ -351,19 +353,133 @@ def _discover_releases(self) -> Tuple[str, ...]:
return discovered_releases
@property
- def releases(self) -> Tuple[str, ...]:
- """Currently available data releases."""
- if self._cache_releases is None:
+ def _available_releases(self) -> Tuple[str, ...]:
+ """Currently available data releases, regardless of `unrestricted_use_only` and `surveillance_use_only`. When `pre` is set to `True`, this includes "pre-releases", otherwise only the "public" releases."""
+ if self._cache_available_releases is None:
if self._pre:
- self._cache_releases = self._discover_releases()
+ self._cache_available_releases = self._discover_releases()
else:
- self._cache_releases = self._public_releases()
+ self._cache_available_releases = self._public_releases()
+
+ return self._cache_available_releases
+
+ @property
+ def _releases_with_unrestricted_data(self) -> Tuple[str, ...]:
+ """Releases that contain some unrestricted data."""
+
+ # Start a list of releases that contain some unrestricted data.
+ releases_with_unrestricted_data = []
+
+ # Get the available releases, which depends on the `pre` setting.
+ available_releases = self._available_releases
+
+ # For each available release...
+ for release in available_releases:
+ # Determine whether this release contains any unrestricted data.
+ if self._release_has_unrestricted_data(release=release):
+ releases_with_unrestricted_data.append(release)
+
+ return tuple(releases_with_unrestricted_data)
+
+ @property
+ def _releases_with_surveillance_data(self) -> Tuple[str, ...]:
+ """Releases that contain some surveillance data."""
+
+ # Start a list of releases that contain some surveillance data.
+ releases_with_surveillance_data = []
+
+ # Get the available releases, which will depend on the `pre` setting.
+ available_releases = self._available_releases
+
+ # For each available release...
+ for release in available_releases:
+ # Determine whether this release contains any surveillance data.
+ if self._release_has_surveillance_data(release=release):
+ releases_with_surveillance_data.append(release)
+
+ return tuple(releases_with_surveillance_data)
+
+ @property
+ def _relevant_releases(self) -> Tuple[str, ...]:
+ """Relevant data releases. When `unrestricted_use_only` is set to `True`, only releases that contain some unrestricted data will be included. When `surveillance_use_only` is set to true, only releases that contain some surveillance data will be included. When `pre` is set to `True`, this includes "pre-releases", otherwise only the "public" releases."""
+
+ if self._cache_releases is None:
+ # Start a list of the relevant releases.
+ relevant_releases = []
+
+ # Get the available releases, which depends on the `pre` setting.
+ available_releases = self._available_releases
+
+ # If there are no criteria, then all available releases are relevant.
+ if not self._unrestricted_use_only and not self._surveillance_use_only:
+ relevant_releases = available_releases
+
+ elif self._unrestricted_use_only and not self._surveillance_use_only:
+ # Get the releases with unrestricted data.
+ releases_with_unrestricted_data = self._releases_with_unrestricted_data
+
+ # Determine whether each release is relevant to the specified criteria.
+ for release in available_releases:
+ # Determine whether this release has any unrestricted data.
+ has_unrestricted_data = release in releases_with_unrestricted_data
+
+ # If we want unrestricted data, but this release doesn't have any, then don't include it.
+ if self._unrestricted_use_only and not has_unrestricted_data:
+ continue
+
+ # Otherwise, this release is relevant, so include it.
+ relevant_releases.append(release)
+
+ elif not self._unrestricted_use_only and self._surveillance_use_only:
+ # Get the releases with surveillance data.
+ releases_with_surveillance_data = self._releases_with_surveillance_data
+
+ # Determine whether each release is relevant to the specified criteria.
+ for release in available_releases:
+ # Determine whether this release has any surveillance data.
+ has_surveillance_data = release in releases_with_surveillance_data
+
+ # If we want surveillance data, but this release doesn't have any, then don't include it.
+ if self._surveillance_use_only and not has_surveillance_data:
+ continue
+
+ # Otherwise, this release is relevant, so include it.
+ relevant_releases.append(release)
+
+ elif self._unrestricted_use_only and self._surveillance_use_only:
+ # Get the releases with unrestricted data.
+ releases_with_unrestricted_data = self._releases_with_unrestricted_data
+
+ # Get the releases with surveillance data.
+ releases_with_surveillance_data = self._releases_with_surveillance_data
+
+ # Determine whether each release is relevant to the specified criteria.
+ for release in available_releases:
+ # Determine whether this release has any unrestricted data.
+ has_unrestricted_data = release in releases_with_unrestricted_data
+
+ # Determine whether this release has any surveillance data.
+ has_surveillance_data = release in releases_with_surveillance_data
+
+ # If we want unrestricted data, but this release doesn't have any, then don't include it.
+ if self._unrestricted_use_only and not has_unrestricted_data:
+ continue
+
+ # If we want surveillance data, but this release doesn't have any, then don't include it.
+ if self._surveillance_use_only and not has_surveillance_data:
+ continue
+
+ # Otherwise, this release is relevant, so include it.
+ relevant_releases.append(release)
+
+ self._cache_releases = tuple(relevant_releases)
+
return self._cache_releases
@property
- def relevant_releases(self) -> Tuple[str, ...]:
- """Relevant data releases. When `unrestricted_use_only` is set to `True`, this excludes releases that contain restricted sample sets."""
- return tuple(r for r in self.releases if not self.sample_sets(release=r).empty)
+ def releases(self) -> Tuple[str, ...]:
+ """Relevant data releases. When `unrestricted_use_only` is set to `True`, only releases that contain some unrestricted data will be included. When `surveillance_use_only` is set to true, only releases that contain some surveillance data will be included. When `pre` is set to `True`, this includes "pre-releases", otherwise only the "public" releases."""
+ return self._relevant_releases
@property
def client_location(self) -> str:
@@ -378,7 +494,90 @@ def client_location(self) -> str:
location = "unknown"
return location
- def _read_sample_sets(self, *, single_release: str):
+ def _release_has_unrestricted_data(self, *, release: str):
+ """Return `True` if the specified release has any unrestricted data. Otherwise return `False`."""
+
+ # The release has unrestricted data if any of its sample sets are marked as unrestricted.
+ # `_read_sample_sets_manifest` gives the sample sets manifest for a release as a DataFrame, potentially with an `unrestricted_use` column.
+
+ # Get the sample sets manifest for the specified release, potentially with the derived `unrestricted_use` column.
+ sample_sets_manifest_df = self._read_sample_sets_manifest(
+ single_release=release
+ )
+
+ # Determine whether any of the sample sets in the manifest are marked as unrestricted.
+ release_has_unrestricted_data = (
+ "unrestricted_use" in sample_sets_manifest_df.columns
+ and sample_sets_manifest_df["unrestricted_use"].any()
+ )
+
+ return release_has_unrestricted_data
+
+ def _release_has_surveillance_data(self, *, release: str):
+ """Return `True` if the specified release has any surveillance data. Otherwise return `False`."""
+
+ # The release has surveillance data if any of its sample sets have any samples that are flagged as `is_surveillance`.
+
+ # Get the list of sample sets for the specified release.
+ # Note: rather than using `sample_sets()`, to avoid additional processing, we are using `_read_sample_sets_manifest()`.
+ sample_sets_manifest_df = self._read_sample_sets_manifest(
+ single_release=release
+ )
+ sample_sets = sample_sets_manifest_df["sample_set"].to_list()
+
+ # Determine whether any of the sample sets have surveillance data.
+ # Note: rather than calling `_surveillance_flags` for all sample sets at once, we check one sample set at a time and stop at the first one that has surveillance data.
+ release_has_surveillance_data = False
+ for sample_set in sample_sets:
+ if self._sample_set_has_surveillance_data(sample_set=sample_set):
+ release_has_surveillance_data = True
+ break
+
+ return release_has_surveillance_data
+
+ def _sample_set_has_surveillance_data(self, *, sample_set: str):
+ """Return `True` if the specified sample set has any surveillance data. Otherwise return `False`."""
+
+ # Get the surveillance flags for this sample set.
+ sample_set_surveillance_flags_df = self._surveillance_flags(
+ sample_sets=[sample_set]
+ )
+
+ # Determine whether there are any samples in this sample set with `is_surveillance` set to `True`.
+ sample_set_has_surveillance_data = (
+ "is_surveillance" in sample_set_surveillance_flags_df.columns
+ and sample_set_surveillance_flags_df["is_surveillance"].any()
+ )
+
+ return sample_set_has_surveillance_data
+
+ def _sample_set_has_unrestricted_use(self, *, sample_set: str):
+ """Return `True` if the specified sample set is marked as `unrestricted_use` in its release manifest. Otherwise return `False`."""
+
+ # Get the manifest data for this sample set.
+ sample_set_release = self.lookup_release(sample_set)
+ release_manifest_df = self._read_sample_sets_manifest(
+ single_release=sample_set_release
+ )
+ sample_set_records_srs = release_manifest_df.loc[
+ release_manifest_df["sample_set"] == sample_set, "unrestricted_use"
+ ]
+
+ if len(sample_set_records_srs) == 0:
+ raise ValueError(
+ f"No release manifest info found for sample_set '{sample_set}'"
+ )
+ elif len(sample_set_records_srs) > 1:
+ raise ValueError(
+ f"More than one record found in the release manifest for sample_set '{sample_set}'"
+ )
+ else:
+ # Convert the NumPy boolean to a standard Python bool.
+ sample_set_has_unrestricted_use = bool(sample_set_records_srs.iloc[0])
+
+ return sample_set_has_unrestricted_use
+
+ def _read_sample_sets_manifest(self, *, single_release: str):
"""Read the manifest of sample sets for a single release."""
# Construct a path for the manifest file.
release_path = self._release_to_path(single_release)
@@ -417,35 +616,135 @@ def _read_sample_sets(self, *, single_release: str):
`term_of_use_expiry` is the date when the terms of use expire,
`terms_of_use_url` is the URL of the terms of use,
`release` is the identifier of the release containing the sample set,
- `unrestricted_use` whether the sample set can be without restriction (e.g., if the terms of use have expired).
- If `unrestricted_use_only` is set to `True` then only sample sets with `unrestricted_use` set to `True` will be included.
+ `unrestricted_use` is whether the sample set can be used without restriction (e.g., if the terms of use have expired).
+ If `unrestricted_use_only` was set to `True` then only sample sets with `unrestricted_use` set to `True` will be included.
+ If `surveillance_use_only` was set to `True` then only sample sets that contain one or more samples with `is_surveillance` set to `True` will be included.
""",
)
def sample_sets(
self,
release: Optional[base_params.release] = None,
+ ) -> pd.DataFrame:
+ return self._relevant_sample_sets(release=release)
+
+ @check_types
+ @doc(
+ summary="Access a dataframe of available sample sets",
+ returns="""A dataframe of available sample sets, one row per sample set, with the following columns:
+ `sample_set` is the name of the sample set,
+ `sample_count` is the number of samples the sample set contains,
+ `study_id` is the identifier for the study that generated the sample set,
+ `study_url` is the URL of the study on the MalariaGEN website,
+ `term_of_use_expiry` is the date when the terms of use expire,
+ `terms_of_use_url` is the URL of the terms of use,
+ `release` is the identifier of the release containing the sample set,
+ `unrestricted_use` is whether the sample set can be used without restriction (e.g., if the terms of use have expired).
+ """,
+ )
+ def _available_sample_sets(
+ self,
+ release: Optional[base_params.release] = None,
) -> pd.DataFrame:
if release is None:
# Retrieve sample sets from all available releases.
- release = self.releases
+ release = self._available_releases
+
+ if isinstance(release, str):
+ # Retrieve sample sets for a single release.
+
+ if release not in self._available_releases:
+ raise ValueError(
+ f"Release is either not relevant or not available: {release!r}"
+ )
+
+ try:
+ df = self._cache_available_sample_sets[release]
+
+ except KeyError:
+ # Read and cache dataframe for performance.
+ df = self._read_sample_sets_manifest(single_release=release)
+ self._cache_available_sample_sets[release] = df
+
+ elif isinstance(release, Sequence):
+ # Ensure no duplicates.
+ releases = sorted(set(release))
+
+ # Retrieve and concatenate sample sets from multiple releases.
+ df = pd.concat(
+ [self._available_sample_sets(release=r) for r in releases],
+ axis=0,
+ ignore_index=True,
+ )
+
+ else:
+ raise TypeError
+
+ # Return copy to ensure cached dataframes aren't modified by user.
+ return df.copy()
+
+ @check_types
+ @doc(
+ summary="Access a dataframe of relevant sample sets",
+ returns="""A dataframe of relevant sample sets, one row per sample set, with the following columns:
+ `sample_set` is the name of the sample set,
+ `sample_count` is the number of samples the sample set contains,
+ `study_id` is the identifier for the study that generated the sample set,
+ `study_url` is the URL of the study on the MalariaGEN website,
+ `term_of_use_expiry` is the date when the terms of use expire,
+ `terms_of_use_url` is the URL of the terms of use,
+ `release` is the identifier of the release containing the sample set,
+ `unrestricted_use` is whether the sample set can be used without restriction (e.g., if the terms of use have expired).
+ If `unrestricted_use_only` was set to `True` then only sample sets with `unrestricted_use` set to `True` will be included.
+ If `surveillance_use_only` was set to `True` then only sample sets that contain one or more samples with `is_surveillance` set to `True` will be included.
+ """,
+ )
+ def _relevant_sample_sets(
+ self,
+ release: Optional[base_params.release] = None,
+ ) -> pd.DataFrame:
+ # Note: `release` must be `None`, one of `_relevant_releases`, or a sequence of such releases.
+ # Otherwise this function will raise a `ValueError`.
+
+ if release is None:
+ # Retrieve sample sets from all relevant releases.
+ release = self._relevant_releases
if isinstance(release, str):
# Retrieve sample sets for a single release.
- if release not in self.releases:
- raise ValueError(f"Release not available: {release!r}")
+ if release not in self._relevant_releases:
+ raise ValueError(
+ f"Release is either not relevant or not available: {release!r}"
+ )
try:
df = self._cache_sample_sets[release]
except KeyError:
# Read and cache dataframe for performance.
- df = self._read_sample_sets(single_release=release)
+ df = self._read_sample_sets_manifest(single_release=release)
# If unrestricted_use_only, restrict to sample sets with unrestricted_use.
if "unrestricted_use" in df.columns and self._unrestricted_use_only:
df = df[df["unrestricted_use"].astype(bool)]
+ # If surveillance_use_only, restrict to sample sets that contain one or more `is_surveillance` samples.
+ if self._surveillance_use_only:
+ # Start a list of the relevant sample sets.
+ relevant_sample_sets = []
+
+ # For each of the DataFrame's sample sets...
+ release_sample_sets = df["sample_set"].to_list()
+ for sample_set in release_sample_sets:
+ # Determine whether this sample set has surveillance data.
+ if self._sample_set_has_surveillance_data(
+ sample_set=sample_set
+ ):
+ relevant_sample_sets.append(sample_set)
+
+ # Remove other sample sets from the DataFrame.
+ df = df[df["sample_set"].isin(relevant_sample_sets)]
+
self._cache_sample_sets[release] = df
elif isinstance(release, Sequence):
@@ -454,7 +753,7 @@ def sample_sets(
# Retrieve and concatenate sample sets from multiple releases.
df = pd.concat(
- [self.sample_sets(release=r) for r in releases],
+ [self._relevant_sample_sets(release=r) for r in releases],
axis=0,
ignore_index=True,
)
@@ -472,13 +771,15 @@ def sample_sets(
)
def lookup_release(self, sample_set: base_params.sample_set) -> str:
if self._cache_sample_set_to_release is None:
- df_sample_sets = self.sample_sets().set_index("sample_set")
+ df_sample_sets = self._available_sample_sets().set_index("sample_set")
self._cache_sample_set_to_release = df_sample_sets["release"].to_dict()
try:
return self._cache_sample_set_to_release[sample_set]
except KeyError:
- raise ValueError(f"No release found for sample set {sample_set!r}")
+ raise ValueError(
+ f"No release found for sample set {sample_set!r}. This sample set might be unavailable or irrelevant with respect to settings."
+ )
@check_types
@doc(
@@ -487,7 +788,7 @@ def lookup_release(self, sample_set: base_params.sample_set) -> str:
)
def lookup_study(self, sample_set: base_params.sample_set) -> str:
if self._cache_sample_set_to_study is None:
- df_sample_sets = self.sample_sets().set_index("sample_set")
+ df_sample_sets = self._available_sample_sets().set_index("sample_set")
self._cache_sample_set_to_study = df_sample_sets["study_id"].to_dict()
try:
return self._cache_sample_set_to_study[sample_set]
@@ -501,7 +802,7 @@ def lookup_study(self, sample_set: base_params.sample_set) -> str:
)
def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
if self._cache_sample_set_to_study_info is None:
- df_sample_sets = self.sample_sets().set_index("sample_set")
+ df_sample_sets = self._available_sample_sets().set_index("sample_set")
self._cache_sample_set_to_study_info = df_sample_sets[
["study_id", "study_url"]
].to_dict(orient="index")
@@ -517,7 +818,7 @@ def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
)
def lookup_terms_of_use_info(self, sample_set: base_params.sample_set) -> dict:
if self._cache_sample_set_to_terms_of_use_info is None:
- df_sample_sets = self.sample_sets().set_index("sample_set")
+ df_sample_sets = self._available_sample_sets().set_index("sample_set")
self._cache_sample_set_to_terms_of_use_info = df_sample_sets[
[
"terms_of_use_expiry_date",
@@ -539,43 +840,69 @@ def _prep_sample_sets_param(
allow this to be a single sample set, or a list of sample sets, or a
release identifier, or a list of release identifiers."""
- all_sample_sets = self.sample_sets()["sample_set"].to_list()
+ # Get the relevant sample sets as a list.
+ all_relevant_sample_sets = self._relevant_sample_sets()["sample_set"].to_list()
+ # If no sample sets are specified...
if sample_sets is None:
- # All available sample sets.
- prepped_sample_sets = all_sample_sets
+ # Assume we want all relevant sample sets.
+ prepared_sample_sets = all_relevant_sample_sets
+ # Otherwise, if the sample sets are specified as a string...
elif isinstance(sample_sets, str):
+ # If the given string starts with the release major version number...
if sample_sets.startswith(f"{self._major_version_number}."):
- # Convenience, can use a release identifier to denote all sample sets in a release.
- prepped_sample_sets = self.sample_sets(release=sample_sets)[
+ # Assume the given string is a release.
+ release = str(sample_sets)
+
+ # Get the relevant sample sets for this release as a list.
+ prepared_sample_sets = self._relevant_sample_sets(release=release)[
"sample_set"
].to_list()
else:
- # Single sample set, normalise to always return a list.
- prepped_sample_sets = [sample_sets]
+ # Assume the given string is a single sample set identifier.
+ # Put the single sample set identifier into a list, for consistency.
+ prepared_sample_sets = [sample_sets]
else:
- # Sequence of sample sets or releases.
- assert isinstance(sample_sets, Sequence)
- prepped_sample_sets = []
- for s in sample_sets:
- # Make a recursive call to handle the case where s is a release identifier.
- sp = self._prep_sample_sets_param(sample_sets=s)
+ # Check that the given sample_sets is some kind of Sequence.
+ # Otherwise, raise an error.
+ if not isinstance(sample_sets, Sequence):
+ sample_sets_type = type(sample_sets)
+ raise ValueError(
+ f"Unsupported data type for sample_sets param: {sample_sets_type}"
+ )
+
+ # sample_sets is a kind of Sequence.
+ seq = sample_sets
+
+ # Start a list of prepared sample sets.
+ prepared_sample_sets = []
- # Make sure we end up with a flat list of sample sets.
- prepped_sample_sets.extend(sp)
+ # For each item in the given Sequence...
+ for seq_item in seq:
+ # The item might be a release identifier.
+ # Make a recursive call to reduce release identifiers into a list of sample sets.
+ seq_item_sample_sets = self._prep_sample_sets_param(
+ sample_sets=seq_item
+ )
+
+ # Use `extend` rather than `append`, because we are adding a list to a list.
+ prepared_sample_sets.extend(seq_item_sample_sets)
- # Ensure all sample sets selected at most once.
- prepped_sample_sets = sorted(set(prepped_sample_sets))
+ # Remove duplicates from the list of sample sets and sort it.
+ prepared_sample_sets = sorted(set(prepared_sample_sets))
- # Check for bad sample sets.
- for s in prepped_sample_sets:
- if s not in all_sample_sets:
- raise ValueError(f"Sample set {s!r} not found.")
+ # Check for unavailable or irrelevant sample sets.
+ if set(prepared_sample_sets) != set(all_relevant_sample_sets):
+ for sample_set in prepared_sample_sets:
+ if sample_set not in all_relevant_sample_sets:
+ raise ValueError(
+ f"Sample set {sample_set!r} not found. This sample set might be unavailable or irrelevant with respect to settings."
+ )
- return prepped_sample_sets
+ return prepared_sample_sets
def _prep_sample_query_param(
self, *, sample_query: Optional[base_params.sample_query]
@@ -585,12 +912,20 @@ def _prep_sample_query_param(
# Return the same data type and default to the original value.
prepped_sample_query: Optional[base_params.sample_query] = sample_query
- # If self._surveillance_use_only, then add "is_surveillance == True"
+ # If `_surveillance_use_only` then ensure there is an is_surveillance query criterion.
if self._surveillance_use_only:
+ is_surveillance_query_criterion = "is_surveillance == True"
+ # If there is no query, then set it to the is_surveillance query criterion.
if sample_query is None or sample_query.strip() == "":
- prepped_sample_query = "is_surveillance == True"
+ prepped_sample_query = is_surveillance_query_criterion
else:
- prepped_sample_query = f"{sample_query} and is_surveillance == True"
+ # If the current query already ends with the is_surveillance query criterion, then keep it as it is.
+ if sample_query.endswith(f" and {is_surveillance_query_criterion}"):
+ prepped_sample_query = sample_query
+ else:
+ prepped_sample_query = (
+ f"{sample_query} and {is_surveillance_query_criterion}"
+ )
return prepped_sample_query
diff --git a/malariagen_data/anoph/cnv_data.py b/malariagen_data/anoph/cnv_data.py
index 66eefa965..bfa9733e4 100644
--- a/malariagen_data/anoph/cnv_data.py
+++ b/malariagen_data/anoph/cnv_data.py
@@ -197,8 +197,8 @@ def cnv_hmm(
debug = self._log.debug
debug("normalise parameters")
- sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
- sample_query = self._prep_sample_query_param(sample_query=sample_query)
+ prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
regions: List[Region] = parse_multi_region(self, region)
del region
@@ -207,7 +207,7 @@ def cnv_hmm(
lx = []
for r in regions:
ly = []
- for s in sample_sets:
+ for s in prepared_sample_sets:
y = self._cnv_hmm_dataset(
contig=r.contig,
sample_set=s,
@@ -244,25 +244,40 @@ def cnv_hmm(
ds = simple_xarray_concat(lx, dim=DIM_VARIANT)
debug("handle sample query")
- if sample_query is not None:
- debug("load sample metadata")
- df_samples = self.sample_metadata(sample_sets=sample_sets)
-
- debug("align sample metadata with CNV data")
- cnv_samples = ds["sample_id"].values.tolist()
- df_samples_cnv = (
- df_samples.set_index("sample_id").loc[cnv_samples].reset_index()
- )
- debug("apply the query")
+ # If there's a sample query...
+ if prepared_sample_query is not None:
+ # Get the relevant sample metadata.
+ df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)
+
+ # If there are no sample query options, then default to an empty dict.
sample_query_options = sample_query_options or {}
- loc_query_samples = df_samples_cnv.eval(
- sample_query, **sample_query_options
+
+ # Determine which samples match the sample query.
+ loc_samples = df_samples.eval(
+ prepared_sample_query, **sample_query_options
).values
- if np.count_nonzero(loc_query_samples) == 0:
- raise ValueError(f"No samples found for query {sample_query!r}")
- ds = ds.isel(samples=loc_query_samples)
+ # Raise an error if no samples match the sample query.
+ if np.count_nonzero(loc_samples) == 0:
+ raise ValueError(
+ f"No samples found for query {prepared_sample_query!r}"
+ )
+
+ # Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
+ relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values
+
+ # Get all the sample ids from the unfiltered CNV HMM Dataset.
+ ds_sample_ids = ds.coords["sample_id"].values
+
+ # Get the indices of samples in the CNV HMM Dataset that match the relevant sample ids.
+ # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
+ relevant_sample_indices = np.where(
+ np.isin(ds_sample_ids, relevant_sample_ids)
+ )[0]
+
+ # Select only the relevant samples from the CNV HMM Dataset.
+ ds = ds.isel(samples=relevant_sample_indices)
debug("handle coverage variance filter")
if max_coverage_variance is not None:
@@ -626,6 +641,11 @@ def cnv_discordant_read_calls(
debug("load sample metadata")
df_samples = self.sample_metadata(sample_sets=sample_sets)
+ if df_samples.empty:
+ raise ValueError(
+ f"No samples found for sample sets {sample_sets!r}. These samples might be unavailable or irrelevant with respect to settings."
+ )
+
debug("align sample metadata with CNV data")
cnv_samples = ds["sample_id"].values.tolist()
df_samples_cnv = (
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index d1138e6ef..7574a1f0f 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -94,13 +94,13 @@ def _parse_metadata_paths(
aim_analysis: Optional[str] = None,
cohorts_analysis: Optional[str] = None,
) -> pd.DataFrame:
- # Normalise input parameters.
- sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
- del sample_sets
+ # Warning: don't use `_prep_sample_sets_param` in this function because that can cause a circular dependency, eventually raising a RecursionError.
+ # For instance, `_prep_sample_sets_param` uses `_relevant_sample_sets`, which uses `_surveillance_flags, which uses `_parse_metadata_paths`.
+ # Instead, use `_prep_sample_sets_param` to prepare `sample_sets` before passing it to this function.
# Obtain paths for all files we need to fetch.
file_paths: Mapping[str, str] = self._metadata_paths(
- sample_sets=sample_sets_prepped,
+ sample_sets=sample_sets,
path_template=path_template,
aim_analysis=aim_analysis,
cohorts_analysis=cohorts_analysis,
@@ -114,7 +114,7 @@ def _parse_metadata_paths(
# Parse files into DataFrames.
dfs = []
- for sample_set in sample_sets_prepped:
+ for sample_set in sample_sets:
path = file_paths[sample_set]
data = files[path]
df = parse_metadata_func(sample_set, data)
@@ -183,10 +183,13 @@ def _parse_general_metadata(
def general_metadata(
self, sample_sets: Optional[base_params.sample_sets] = None
) -> pd.DataFrame:
+ prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ del sample_sets
+
return self._parse_metadata_paths(
path_template="{release_path}/metadata/general/{sample_set}/samples.meta.csv",
parse_metadata_func=self._parse_general_metadata,
- sample_sets=sample_sets,
+ sample_sets=prepared_sample_sets,
)
@property
@@ -324,10 +327,13 @@ def _parse_sequence_qc_metadata(
def sequence_qc_metadata(
self, sample_sets: Optional[base_params.sample_sets] = None
) -> pd.DataFrame:
+ prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ del sample_sets
+
return self._parse_metadata_paths(
path_template="{release_path}/metadata/curation/{sample_set}/sequence_qc_stats.csv",
parse_metadata_func=self._parse_sequence_qc_metadata,
- sample_sets=sample_sets,
+ sample_sets=prepared_sample_sets,
)
def _parse_surveillance_flags(
@@ -337,9 +343,11 @@ def _parse_surveillance_flags(
original_warning_filters = warnings.filters[:]
# Specify the expected data type for each column.
+ # Note: "bool" is not nullable and does not support `NaN`, which is required when missing data.
+ # Otherwise `NaN` will be mis-translated to `True` when the dtype is applied to the DataFrame.
dtype = {
"sample_id": "object",
- "is_surveillance": "bool",
+ "is_surveillance": "boolean",
}
if isinstance(data, bytes):
@@ -402,9 +410,11 @@ def _parse_surveillance_flags(
`is_surveillance` indicates whether the sample can be used for surveillance,
""",
)
- def surveillance_flags(
- self, sample_sets: Optional[base_params.sample_sets] = None
- ) -> pd.DataFrame:
+ def _surveillance_flags(self, sample_sets: base_params.sample_sets) -> pd.DataFrame:
+ # Warning: don't use `_prep_sample_sets_param` here, because `_prep_sample_sets_param` uses `_relevant_sample_sets`,
+ # which uses this function, which would cause a RecursionError due to cyclic dependency.
+ # Instead, prepare the `sample_sets` parameter before calling this function.
+
return self._parse_metadata_paths(
path_template="{release_path}/metadata/general/{sample_set}/surveillance.flags.csv",
parse_metadata_func=self._parse_surveillance_flags,
@@ -518,10 +528,13 @@ def cohorts_metadata(
) -> pd.DataFrame:
self._require_cohorts_analysis()
+ prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ del sample_sets
+
return self._parse_metadata_paths(
path_template="{release_path}/metadata/cohorts_{cohorts_analysis}/{sample_set}/samples.cohorts.csv",
parse_metadata_func=self._parse_cohorts_metadata,
- sample_sets=sample_sets,
+ sample_sets=prepared_sample_sets,
cohorts_analysis=self._cohorts_analysis,
)
@@ -579,10 +592,13 @@ def aim_metadata(
) -> pd.DataFrame:
self._require_aim_analysis()
+ prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ del sample_sets
+
return self._parse_metadata_paths(
path_template="{release_path}/metadata/species_calls_aim_{aim_analysis}/{sample_set}/samples.species_aim.csv",
parse_metadata_func=self._parse_aim_metadata,
- sample_sets=sample_sets,
+ sample_sets=prepared_sample_sets,
aim_analysis=self._aim_analysis,
)
@@ -648,18 +664,21 @@ def sample_metadata(
sample_query_options: Optional[base_params.sample_query_options] = None,
sample_indices: Optional[base_params.sample_indices] = None,
) -> pd.DataFrame:
- # Extra parameter checks.
+ # Check that sample_query and sample_indices are not both provided (each is optional).
base_params.validate_sample_selection_params(
sample_query=sample_query, sample_indices=sample_indices
)
- # Normalise parameters.
- sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
+ # Prepare parameters.
+ prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
+
+ # Delete original parameters to prevent accidental use.
del sample_sets
- sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
del sample_query
- cache_key = tuple(sample_sets_prepped)
+ # Determine the cache key.
+ cache_key = tuple(prepared_sample_sets)
try:
# Attempt to retrieve from the cache.
df_samples = self._cache_sample_metadata[cache_key]
@@ -670,12 +689,12 @@ def sample_metadata(
# Get the general sample metadata.
# Note: this includes study and terms-of-use info.
- df_samples = self.general_metadata(sample_sets=sample_sets_prepped)
+ df_samples = self.general_metadata(sample_sets=prepared_sample_sets)
# Merge with the sequence QC metadata.
# Note: merging can change column dtypes, e.g. due to new NaNs.
df_sequence_qc = self.sequence_qc_metadata(
- sample_sets=sample_sets_prepped
+ sample_sets=prepared_sample_sets
)
df_samples = df_samples.merge(
df_sequence_qc, on="sample_id", sort=False, how="left"
@@ -683,8 +702,8 @@ def sample_metadata(
# Merge with the surveillance flags.
# Note: merging can change column dtypes, e.g. due to new NaNs.
- df_surveillance_flags = self.surveillance_flags(
- sample_sets=sample_sets_prepped
+ df_surveillance_flags = self._surveillance_flags(
+ sample_sets=prepared_sample_sets
)
df_samples = df_samples.merge(
df_surveillance_flags, on="sample_id", sort=False, how="left"
@@ -692,23 +711,18 @@ def sample_metadata(
# If available, merge with the AIM metadata.
if self._aim_analysis:
- df_aim = self.aim_metadata(sample_sets=sample_sets_prepped)
+ df_aim = self.aim_metadata(sample_sets=prepared_sample_sets)
df_samples = df_samples.merge(
df_aim, on="sample_id", sort=False, how="left"
)
# If available, merge with the cohorts metadata.
if self._cohorts_analysis:
- df_cohorts = self.cohorts_metadata(sample_sets=sample_sets_prepped)
+ df_cohorts = self.cohorts_metadata(sample_sets=prepared_sample_sets)
df_samples = df_samples.merge(
df_cohorts, on="sample_id", sort=False, how="left"
)
- # If surveillance_use_only, restrict to samples with is_surveillance.
- # Note: this will also be enforced via self._prep_sample_query_param().
- if "is_surveillance" in df_samples.columns and self._surveillance_use_only:
- df_samples = df_samples[df_samples["is_surveillance"].astype(bool)]
-
# Store sample metadata in the cache.
self._cache_sample_metadata[cache_key] = df_samples
@@ -717,10 +731,10 @@ def sample_metadata(
df_samples = df_samples.merge(data, how="left", on=on)
# Apply the sample_query or sample_indices, if specified.
- if sample_query_prepped is not None:
+ if prepared_sample_query is not None:
# Assume a pandas query string.
sample_query_options = sample_query_options or {}
- df_samples = df_samples.query(sample_query_prepped, **sample_query_options)
+ df_samples = df_samples.query(prepared_sample_query, **sample_query_options)
df_samples = df_samples.reset_index(drop=True)
elif sample_indices is not None:
# Assume it is an indexer.
@@ -940,6 +954,24 @@ def wgs_data_catalog(self, sample_set: base_params.sample_set):
]
]
+ # If `_unrestricted_use_only` is `True`, then only return data if this sample set has `unrestricted_use` set to `True`.
+ if self._unrestricted_use_only and not self._sample_set_has_unrestricted_use(
+ sample_set=sample_set
+ ):
+ # Remove all the data from the DataFrame and reset its index.
+ df = df.iloc[0:0].reset_index(drop=True)
+
+ # If `_surveillance_use_only` is `True`, then only return samples that have `is_surveillance` set to `True`.
+ if self._surveillance_use_only:
+ surveillance_flags_df = self._surveillance_flags(sample_sets=[sample_set])
+ df = df.merge(
+ surveillance_flags_df[["sample_id", "is_surveillance"]],
+ on="sample_id",
+ how="left",
+ )
+ df = df[df["is_surveillance"]]
+ df = df.drop(columns=["is_surveillance"])
+
return df
@check_types
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
index 7536a927e..c2c6c105c 100644
--- a/tests/anoph/test_sample_metadata.py
+++ b/tests/anoph/test_sample_metadata.py
@@ -316,7 +316,7 @@ def test_general_metadata_with_multiple_sample_sets(
@parametrize_with_cases("fixture,api", cases=".")
def test_general_metadata_with_release(fixture, api: AnophelesSampleMetadata):
# Set up the test.
- release = random.choice(api.relevant_releases)
+ release = random.choice(api.releases)
# Call function to be tested.
df = api.general_metadata(sample_sets=release)
@@ -400,7 +400,7 @@ def test_sequence_qc_metadata_with_multiple_sample_sets(
@parametrize_with_cases("fixture,api", cases=".")
def test_sequence_qc_metadata_with_release(fixture, api: AnophelesSampleMetadata):
# Set up the test.
- release = random.choice(api.relevant_releases)
+ release = random.choice(api.releases)
# Call function to be tested.
df = api.sequence_qc_metadata(sample_sets=release)
@@ -619,7 +619,7 @@ def test_cohorts_metadata_with_multiple_sample_sets(
@parametrize_with_cases("fixture,api", cases=".")
def test_cohorts_metadata_with_release(fixture, api: AnophelesSampleMetadata):
# Set up test.
- release = random.choice(api.relevant_releases)
+ release = random.choice(api.releases)
# Call function to be tested.
df = api.cohorts_metadata(sample_sets=release)
@@ -735,7 +735,7 @@ def test_sample_metadata_with_multiple_sample_sets(
@parametrize_with_cases("fixture,api", cases=".")
def test_sample_metadata_with_release(fixture, api: AnophelesSampleMetadata):
# Set up test.
- release = random.choice(api.relevant_releases)
+ release = random.choice(api.releases)
# Call function to be tested.
df = api.sample_metadata(sample_sets=release)
@@ -759,7 +759,7 @@ def test_sample_metadata_with_duplicate_sample_sets(
fixture, api: AnophelesSampleMetadata
):
# Set up test.
- release = random.choice(api.relevant_releases)
+ release = random.choice(api.releases)
df_sample_sets = api.sample_sets(release=release).set_index("sample_set")
all_sample_sets = df_sample_sets.index.to_list()
sample_set = random.choice(all_sample_sets)
From d125707fd3de664d8b047d342b5ffbe2ddfb5d66 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 23 May 2025 16:24:21 +0100
Subject: [PATCH 11/32] WIP: amend data types
---
malariagen_data/anoph/base.py | 8 +++++---
malariagen_data/anoph/sample_metadata.py | 16 ++++++++--------
2 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index b17bc6603..eb7ba8d6a 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -405,14 +405,14 @@ def _relevant_releases(self) -> Tuple[str, ...]:
if self._cache_releases is None:
# Start a list of the relevant releases.
- relevant_releases = []
+ relevant_releases = [] # type: List[str]
# Get the available releases, which depends on the `pre` setting.
available_releases = self._available_releases
# If there are no criteria, then all available releases are relevant.
if not self._unrestricted_use_only and not self._surveillance_use_only:
- relevant_releases = available_releases
+ relevant_releases = list(available_releases)
elif self._unrestricted_use_only and not self._surveillance_use_only:
# Get the releases with unrestricted data.
@@ -494,6 +494,9 @@ def client_location(self) -> str:
location = "unknown"
return location
+ def _surveillance_flags(self, sample_sets: List[str]):
+ raise NotImplementedError("Subclasses must implement `_surveillance_flags`.")
+
def _release_has_unrestricted_data(self, *, release: str):
"""Return `True` if the specified release has any unrestricted data. Otherwise return `False`."""
@@ -526,7 +529,6 @@ def _release_has_surveillance_data(self, *, release: str):
sample_sets = sample_sets_manifest_df["sample_set"].to_list()
# Determine whether any of the sample sets have surveillance data.
- # Note: rather than using `_surveillance_flags`, to avoid unnecessary processing, we only need to find one sample set.
release_has_surveillance_data = False
for sample_set in sample_sets:
if self._sample_set_has_surveillance_data(sample_set=sample_set):
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 7574a1f0f..a5905e020 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -90,13 +90,13 @@ def _parse_metadata_paths(
self,
path_template: str,
parse_metadata_func: Callable[[str, Union[bytes, Exception]], pd.DataFrame],
- sample_sets: Optional[base_params.sample_sets] = None,
+ sample_sets: List[str],
aim_analysis: Optional[str] = None,
cohorts_analysis: Optional[str] = None,
) -> pd.DataFrame:
- # Warning: don't use `_prep_sample_sets_param` in this function because that can cause a circular dependency, eventually raising a RecursionError.
- # For instance, `_prep_sample_sets_param` uses `_relevant_sample_sets`, which uses `_surveillance_flags, which uses `_parse_metadata_paths`.
- # Instead, use `_prep_sample_sets_param` to prepare `sample_sets` before passing it to this function.
+ # Note: we don't use `_prep_sample_sets_param` in this function because that can cause a circular dependency, eventually raising a `RecursionError`.
+ # For instance, `_prep_sample_sets_param` uses `_relevant_sample_sets`, which uses `_surveillance_flags`, which uses `_parse_metadata_paths`.
+ # Instead, use `_prep_sample_sets_param` to prepare `sample_sets` as a `List[str]` before passing it to this function.
# Obtain paths for all files we need to fetch.
file_paths: Mapping[str, str] = self._metadata_paths(
@@ -410,10 +410,10 @@ def _parse_surveillance_flags(
`is_surveillance` indicates whether the sample can be used for surveillance,
""",
)
- def _surveillance_flags(self, sample_sets: base_params.sample_sets) -> pd.DataFrame:
- # Warning: don't use `_prep_sample_sets_param` here, because `_prep_sample_sets_param` uses `_relevant_sample_sets`,
- # which uses this function, which would cause a RecursionError due to cyclic dependency.
- # Instead, prepare the `sample_sets` parameter before calling this function.
+ def _surveillance_flags(self, sample_sets: List[str]) -> pd.DataFrame:
+ # Note: we don't use `_prep_sample_sets_param` in this function because that can cause a circular dependency, eventually raising a `RecursionError`.
+ # For instance, `_prep_sample_sets_param` uses `_relevant_sample_sets`, which uses `_surveillance_flags`.
+ # Instead, use `_prep_sample_sets_param` to prepare `sample_sets` as a `List[str]` before passing it to this function.
return self._parse_metadata_paths(
path_template="{release_path}/metadata/general/{sample_set}/surveillance.flags.csv",
From a9f44c469bd41548976fbd05f85d5464cbed5ea1 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 23 May 2025 17:23:30 +0100
Subject: [PATCH 12/32] Add doc for _surveillance_flags sample_sets param
---
malariagen_data/anoph/sample_metadata.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index a5905e020..10c555aa9 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -405,6 +405,9 @@ def _parse_surveillance_flags(
summary="""
Access surveillance flags for one or more sample sets.
""",
+ parameters=dict(
+ sample_sets="List of sample sets.",
+ ),
returns="""A pandas DataFrame, one row per sample. The columns are:
`sample_id` is the identifier of the sample,
`is_surveillance` indicates whether the sample can be used for surveillance,
From 3bad01623639dc90108524ff0dc264fb7193fea2 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 29 May 2025 11:02:50 +0100
Subject: [PATCH 13/32] WIP: dev support for unrestricted_use_only,
surveillance_use_only
---
malariagen_data/anoph/aim_data.py | 6 ++---
malariagen_data/anoph/cnv_data.py | 42 ++++++++++++++++++++++++++++---
2 files changed, 41 insertions(+), 7 deletions(-)
diff --git a/malariagen_data/anoph/aim_data.py b/malariagen_data/anoph/aim_data.py
index 46f671e88..b215113fc 100644
--- a/malariagen_data/anoph/aim_data.py
+++ b/malariagen_data/anoph/aim_data.py
@@ -172,12 +172,10 @@ def aim_calls(
sample_query_options = sample_query_options or {}
# Determine which samples match the sample query.
- loc_samples = df_samples.eval(
- prepared_sample_query, **sample_query_options
- ).values
+ loc_samples = df_samples.eval(prepared_sample_query, **sample_query_options)
# Raise an error if no samples match the sample query.
- if np.count_nonzero(loc_samples) == 0:
+ if not loc_samples.any():
raise ValueError(
f"No samples found for query {prepared_sample_query!r}"
)
diff --git a/malariagen_data/anoph/cnv_data.py b/malariagen_data/anoph/cnv_data.py
index bfa9733e4..c76c7692c 100644
--- a/malariagen_data/anoph/cnv_data.py
+++ b/malariagen_data/anoph/cnv_data.py
@@ -256,10 +256,10 @@ def cnv_hmm(
# Determine which samples match the sample query.
loc_samples = df_samples.eval(
prepared_sample_query, **sample_query_options
- ).values
+ )
# Raise an error if no samples match the sample query.
- if np.count_nonzero(loc_samples) == 0:
+ if not loc_samples.any():
raise ValueError(
f"No samples found for query {prepared_sample_query!r}"
)
@@ -435,7 +435,11 @@ def cnv_coverage_calls(
debug("normalise parameters")
regions: List[Region] = parse_multi_region(self, region)
+ prepared_sample_set = self._prep_sample_sets_param(sample_sets=sample_set)[0]
+
+ # Delete original parameters to prevent accidental use.
del region
+ del sample_set
debug("access data and concatenate as needed")
lx = []
@@ -443,7 +447,7 @@ def cnv_coverage_calls(
debug("obtain coverage calls for the contig")
x = self._cnv_coverage_calls_dataset(
contig=r.contig,
- sample_set=sample_set,
+ sample_set=prepared_sample_set,
analysis=analysis,
inline_array=inline_array,
chunks=chunks,
@@ -462,6 +466,38 @@ def cnv_coverage_calls(
lx.append(x)
ds = simple_xarray_concat(lx, dim=DIM_VARIANT)
+ # Filter the samples using this default sample query.
+ # For example, this might filter out non-surveillance samples.
+ prepared_sample_query = self._prep_sample_query_param(sample_query="")
+
+ # Get the relevant sample metadata.
+ df_samples = self.sample_metadata(sample_sets=prepared_sample_set)
+
+ # Determine which samples match the sample query.
+ if prepared_sample_query != "":
+ loc_samples = df_samples.eval(prepared_sample_query)
+ else:
+ loc_samples = pd.Series(True, index=df_samples.index)
+
+ # Raise an error if no samples match the sample query.
+ if not loc_samples.any():
+ raise ValueError(f"No samples found for query {prepared_sample_query!r}")
+
+ # Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
+ relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values
+
+ # Get all the sample ids from the unfiltered CNV coverage calls Dataset.
+ ds_sample_ids = ds.coords["sample_id"].values
+
+ # Get the indices of samples in the CNV coverage calls Dataset that match the relevant sample ids.
+ # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
+ relevant_sample_indices = np.where(np.isin(ds_sample_ids, relevant_sample_ids))[
+ 0
+ ]
+
+ # Select only the relevant samples from the CNV coverage calls Dataset.
+ ds = ds.isel(samples=relevant_sample_indices)
+
return ds
@check_types
From 88347b02ec43d7daf65fdbb910426de9b7f828a2 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 29 May 2025 11:37:12 +0100
Subject: [PATCH 14/32] WIP: update cnv_discordant_read_calls to honour
constructor params
---
malariagen_data/anoph/cnv_data.py | 58 ++++++++++++++++++++-----------
1 file changed, 37 insertions(+), 21 deletions(-)
diff --git a/malariagen_data/anoph/cnv_data.py b/malariagen_data/anoph/cnv_data.py
index c76c7692c..65c3edfad 100644
--- a/malariagen_data/anoph/cnv_data.py
+++ b/malariagen_data/anoph/cnv_data.py
@@ -200,6 +200,10 @@ def cnv_hmm(
prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
regions: List[Region] = parse_multi_region(self, region)
+
+ # Delete original parameters to prevent accidental use.
+ del sample_sets
+ del sample_query
del region
with self._spinner("Access CNV HMM data"):
@@ -244,7 +248,6 @@ def cnv_hmm(
ds = simple_xarray_concat(lx, dim=DIM_VARIANT)
debug("handle sample query")
-
# If there's a sample query...
if prepared_sample_query is not None:
# Get the relevant sample metadata.
@@ -640,16 +643,20 @@ def cnv_discordant_read_calls(
# CNV alleles have unknown start or end coordinates.
debug("normalise parameters")
- sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
- sample_query = self._prep_sample_query_param(sample_query=sample_query)
+ prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
if isinstance(contig, str):
contig = [contig]
+ # Delete original parameters to prevent accidental use.
+ del sample_sets
+ del sample_query
+
debug("access data and concatenate as needed")
lx = []
for c in contig:
ly = []
- for s in sample_sets:
+ for s in prepared_sample_sets:
y = self._cnv_discordant_read_calls_dataset(
contig=c,
sample_set=s,
@@ -673,30 +680,39 @@ def cnv_discordant_read_calls(
ds = simple_xarray_concat(lx, dim=DIM_VARIANT)
debug("handle sample query")
- if sample_query is not None:
+
+ # If there's a sample query...
+ if prepared_sample_query is not None:
debug("load sample metadata")
- df_samples = self.sample_metadata(sample_sets=sample_sets)
+ # Get the relevant sample metadata.
+ df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)
+
+ # If there are no sample query options, then default to an empty dict.
+ sample_query_options = sample_query_options or {}
+
+ # Determine which samples match the sample query.
+ loc_samples = df_samples.eval(prepared_sample_query, **sample_query_options)
- if df_samples.empty:
+ # Raise an error if no samples match the sample query.
+ if not loc_samples.any():
raise ValueError(
- f"No samples found for sample sets {sample_sets!r}. These samples might be unavailable or irrelevant with respect to settings."
+ f"No samples found for query {prepared_sample_query!r}"
)
- debug("align sample metadata with CNV data")
- cnv_samples = ds["sample_id"].values.tolist()
- df_samples_cnv = (
- df_samples.set_index("sample_id").loc[cnv_samples].reset_index()
- )
+ # Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
+ relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values
- debug("apply the query")
- sample_query_options = sample_query_options or {}
- loc_query_samples = df_samples_cnv.eval(
- sample_query, **sample_query_options
- ).values
- if np.count_nonzero(loc_query_samples) == 0:
- raise ValueError(f"No samples found for query {sample_query!r}")
+ # Get all the sample ids from the unfiltered CNV discordant reads Dataset.
+ ds_sample_ids = ds.coords["sample_id"].values
+
+ # Get the indices of samples in the CNV discordant reads Dataset that match the relevant sample ids.
+ # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
+ relevant_sample_indices = np.where(
+ np.isin(ds_sample_ids, relevant_sample_ids)
+ )[0]
- ds = ds.isel(samples=loc_query_samples)
+ # Select only the relevant samples from the CNV discordant reads Dataset.
+ ds = ds.isel(samples=relevant_sample_indices)
return ds
From 519fec263695409348eca234629971a65d118094 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 29 May 2025 12:57:13 +0100
Subject: [PATCH 15/32] Convert dtype dict to defaultdict for pd.read_csv
---
malariagen_data/anoph/sample_metadata.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 10c555aa9..690e940fa 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -2,6 +2,7 @@
from itertools import cycle
from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union
import warnings
+from collections import defaultdict
import ipyleaflet # type: ignore
import numpy as np
@@ -141,6 +142,8 @@ def _parse_general_metadata(
"longitude": "float64",
"sex_call": "object",
}
+ # `dtype` of `dict[str, str]` is incompatible with `read_csv`
+ dtype = defaultdict(str, dtype)
df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
# Ensure all column names are lower case.
@@ -349,6 +352,8 @@ def _parse_surveillance_flags(
"sample_id": "object",
"is_surveillance": "boolean",
}
+ # `dtype` of `dict[str, str]` is incompatible with `read_csv`
+ dtype = defaultdict(str, dtype)
if isinstance(data, bytes):
# Read the CSV data.
From f64e64f70f83cc05c159de5496fcbb01543e692c Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 29 May 2025 12:59:00 +0100
Subject: [PATCH 16/32] Convert df.index.names to List[str] for list
---
malariagen_data/anoph/frq_base.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
index 7f1448e77..fe39805e4 100644
--- a/malariagen_data/anoph/frq_base.py
+++ b/malariagen_data/anoph/frq_base.py
@@ -210,7 +210,9 @@ def plot_frequencies_heatmap(
# Indexing.
if index is None:
- index = list(df.index.names)
+ # `list[Hashable]` is incompatible with `list`
+ index_names_as_list = [str(name) for name in df.index.names]
+ index = list(index_names_as_list)
df = df.reset_index().copy()
if isinstance(index, list):
index_col = (
From 613185c47f5b33fe1e918201c1574538c5c4eeee Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 29 May 2025 13:01:11 +0100
Subject: [PATCH 17/32] Ignore type check for untyped array comparison in
test_snp_frq.py. Rename repeated expected_alleles var.
---
tests/anoph/test_snp_frq.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tests/anoph/test_snp_frq.py b/tests/anoph/test_snp_frq.py
index aa803f031..7a98c4628 100644
--- a/tests/anoph/test_snp_frq.py
+++ b/tests/anoph/test_snp_frq.py
@@ -156,10 +156,10 @@ def test_snp_effects(fixture, api: AnophelesSnpFrequencyAnalysis):
position = df["position"].values
assert np.all(position >= transcript["start"])
assert np.all(position <= transcript["end"])
- assert np.all(position[1:] >= position[:-1])
- expected_alleles = list("ACGT")
- assert np.all(df["ref_allele"].isin(expected_alleles))
- assert np.all(df["alt_allele"].isin(expected_alleles))
+ assert np.all(position[1:] >= position[:-1]) # type: ignore
+ test_expected_alleles = list("ACGT")
+ assert np.all(df["ref_allele"].isin(test_expected_alleles))
+ assert np.all(df["alt_allele"].isin(test_expected_alleles))
assert np.all(df["transcript"] == transcript.name)
assert np.all(df["effect"].isin(expected_effects))
assert np.all(df["impact"].isin(expected_impacts))
From e041a912166e096344ab69e0face1bf9d06b5ff2 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 29 May 2025 14:28:09 +0100
Subject: [PATCH 18/32] Use defaultdict for _aim_metadata_dtype for pd.read_csv
---
malariagen_data/anoph/sample_metadata.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 690e940fa..8424f7d11 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -41,7 +41,8 @@ def __init__(
# data resources, and so column names and dtype need to be
# passed in as parameters.
self._aim_metadata_columns: Optional[List[str]] = None
- self._aim_metadata_dtype: Dict[str, Any] = dict()
+ # `dtype` of `dict[str, Any]` is incompatible with `read_csv`
+ self._aim_metadata_dtype: defaultdict[str, Any] = dict()
if isinstance(aim_metadata_dtype, Mapping):
self._aim_metadata_columns = list(aim_metadata_dtype.keys())
self._aim_metadata_dtype.update(aim_metadata_dtype)
From 82fa84df0b8c4c7832f0ab9108ab670ca844da4c Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 29 May 2025 14:34:57 +0100
Subject: [PATCH 19/32] Amend defaultdict assignment for _aim_metadata_dtype
---
malariagen_data/anoph/sample_metadata.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 8424f7d11..530a66479 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -42,7 +42,7 @@ def __init__(
# passed in as parameters.
self._aim_metadata_columns: Optional[List[str]] = None
# `dtype` of `dict[str, Any]` is incompatible with `read_csv`
- self._aim_metadata_dtype: defaultdict[str, Any] = dict()
+ self._aim_metadata_dtype: defaultdict[str, Any] = defaultdict()
if isinstance(aim_metadata_dtype, Mapping):
self._aim_metadata_columns = list(aim_metadata_dtype.keys())
self._aim_metadata_dtype.update(aim_metadata_dtype)
From 09f224a0419ed9657df165d9646159bf0f7b44ed Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 29 May 2025 15:25:56 +0100
Subject: [PATCH 20/32] Amend dtype data type for pd.read_csv
---
malariagen_data/anoph/sample_metadata.py | 48 +++++++++++++++++++-----
1 file changed, 38 insertions(+), 10 deletions(-)
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 530a66479..59e6d0777 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -1,8 +1,18 @@
import io
from itertools import cycle
-from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ List,
+ Mapping,
+ Optional,
+ Sequence,
+ Tuple,
+ Union,
+ cast,
+)
import warnings
-from collections import defaultdict
import ipyleaflet # type: ignore
import numpy as np
@@ -41,11 +51,21 @@ def __init__(
# data resources, and so column names and dtype need to be
# passed in as parameters.
self._aim_metadata_columns: Optional[List[str]] = None
- # `dtype` of `dict[str, Any]` is incompatible with `read_csv`
- self._aim_metadata_dtype: defaultdict[str, Any] = defaultdict()
+ # `dtype` of `dict[str, Any]` is incompatible with `pd.read_csv`
+ self._aim_metadata_dtype: Mapping[
+ str, Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype]
+ ] = {}
if isinstance(aim_metadata_dtype, Mapping):
self._aim_metadata_columns = list(aim_metadata_dtype.keys())
- self._aim_metadata_dtype.update(aim_metadata_dtype)
+ self._aim_metadata_dtype.update(
+ cast(
+ Mapping[
+ str,
+ Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype],
+ ],
+ aim_metadata_dtype,
+ )
+ )
self._aim_metadata_dtype["sample_id"] = "object"
# Set up taxon colors.
@@ -143,9 +163,14 @@ def _parse_general_metadata(
"longitude": "float64",
"sex_call": "object",
}
- # `dtype` of `dict[str, str]` is incompatible with `read_csv`
- dtype = defaultdict(str, dtype)
- df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+ # `dtype` of `dict[str, str]` is incompatible with `pd.read_csv`
+ dtype_mapping = cast(
+ Mapping[
+ str, Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype]
+ ],
+ dtype,
+ )
+ df = pd.read_csv(io.BytesIO(data), dtype=dtype_mapping, na_values="")
# Ensure all column names are lower case.
df.columns = [c.lower() for c in df.columns] # type: ignore
@@ -349,12 +374,15 @@ def _parse_surveillance_flags(
# Specify the expected data type for each column.
# Note: "bool" is not nullable and does not support `NaN`, which is required when missing data.
# Otherwise `NaN` will be mis-translated to `True` when the dtype is applied to the DataFrame.
- dtype = {
+ dtype_dict = {
"sample_id": "object",
"is_surveillance": "boolean",
}
# `dtype` of `dict[str, str]` is incompatible with `read_csv`
- dtype = defaultdict(str, dtype)
+ dtype = cast(
+ Mapping[str, Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype]],
+ dtype_dict,
+ )
if isinstance(data, bytes):
# Read the CSV data.
From 21af2d2b944d181c8b12fbf3d472959cac297502 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 30 May 2025 13:03:08 +0100
Subject: [PATCH 21/32] Fix bug in applying aim_metadata_dtype. Amend data
types.
---
malariagen_data/ag3.py | 21 +++---
malariagen_data/anoph/frq_base.py | 3 +-
malariagen_data/anoph/sample_metadata.py | 82 +++++++++++++-----------
tests/anoph/test_sample_metadata.py | 2 +-
4 files changed, 60 insertions(+), 48 deletions(-)
diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
index ad9363c2d..771b3e568 100644
--- a/malariagen_data/ag3.py
+++ b/malariagen_data/ag3.py
@@ -77,6 +77,18 @@ def _setup_aim_palettes():
"unassigned": "black",
}
+# Note: These column names will be treated as case-insensitive,
+# because these column names and the column names from the CSV
+# will be converted to lowercase before applying these dtypes.
+AIM_METADATA_DTYPE = {
+ "aim_species_fraction_arab": "float64",
+ "aim_species_fraction_colu": "float64",
+ "aim_species_fraction_colu_no2l": "float64",
+ "aim_species_gambcolu_arabiensis": "object",
+ "aim_species_gambiae_coluzzii": "object",
+ "aim_species": "object",
+}
+
class Ag3(AnophelesDataResource):
"""Provides access to data from Ag3.x releases.
@@ -162,14 +174,7 @@ def __init__(
config_path=CONFIG_PATH,
cohorts_analysis=cohorts_analysis,
aim_analysis=aim_analysis,
- aim_metadata_dtype={
- "aim_species_fraction_arab": "float64",
- "aim_species_fraction_colu": "float64",
- "aim_species_fraction_colu_no2l": "float64",
- "aim_species_gambcolu_arabiensis": "object",
- "aim_species_gambiae_coluzzii": "object",
- "aim_species": "object",
- },
+ aim_metadata_dtype=AIM_METADATA_DTYPE,
aim_ids=("gambcolu_vs_arab", "gamb_vs_colu"),
aim_palettes=AIM_PALETTES,
site_filters_analysis=site_filters_analysis,
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
index fe39805e4..b5d85d6b7 100644
--- a/malariagen_data/anoph/frq_base.py
+++ b/malariagen_data/anoph/frq_base.py
@@ -210,7 +210,8 @@ def plot_frequencies_heatmap(
# Indexing.
if index is None:
- # `list[Hashable]` is incompatible with `list`
+ # `list[Hashable]` is incompatible with the param for `list`
+ # Convert `df.index.names` to a `list[str]` instead.
index_names_as_list = [str(name) for name in df.index.names]
index = list(index_names_as_list)
df = df.reset_index().copy()
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 59e6d0777..c06965cb9 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -3,6 +3,7 @@
from typing import (
Any,
Callable,
+ DefaultDict,
Dict,
List,
Mapping,
@@ -10,8 +11,8 @@
Sequence,
Tuple,
Union,
- cast,
)
+from collections import defaultdict
import warnings
import ipyleaflet # type: ignore
@@ -51,21 +52,22 @@ def __init__(
# data resources, and so column names and dtype need to be
# passed in as parameters.
self._aim_metadata_columns: Optional[List[str]] = None
- # `dtype` of `dict[str, Any]` is incompatible with `pd.read_csv`
- self._aim_metadata_dtype: Mapping[
- str, Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype]
- ] = {}
+ self._aim_metadata_dtype: Optional[Mapping[str, Any]] = {}
+
+ # Only apply the `aim_metadata_dtype` if it is a type of `Mapping`.
if isinstance(aim_metadata_dtype, Mapping):
- self._aim_metadata_columns = list(aim_metadata_dtype.keys())
- self._aim_metadata_dtype.update(
- cast(
- Mapping[
- str,
- Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype],
- ],
- aim_metadata_dtype,
- )
- )
+ # Convert all of the column names to lowercase.
+ prepared_aim_metadata_dtype_dict = {
+ k.lower(): v for k, v in aim_metadata_dtype.items()
+ }
+
+ # Get all the column names from the prepared dict.
+ self._aim_metadata_columns = list(prepared_aim_metadata_dtype_dict.keys())
+
+ # Update the _aim_metadata_dtype with the prepared dict.
+ self._aim_metadata_dtype.update(prepared_aim_metadata_dtype_dict)
+
+ # Add the sample_id to the _aim_metadata_dtype.
self._aim_metadata_dtype["sample_id"] = "object"
# Set up taxon colors.
@@ -151,7 +153,7 @@ def _parse_general_metadata(
self, sample_set: str, data: Union[bytes, Exception]
) -> pd.DataFrame:
if isinstance(data, bytes):
- dtype = {
+ dtype_dict = {
"sample_id": "object",
"partner_sample_id": "object",
"contributor": "object",
@@ -163,14 +165,9 @@ def _parse_general_metadata(
"longitude": "float64",
"sex_call": "object",
}
- # `dtype` of `dict[str, str]` is incompatible with `pd.read_csv`
- dtype_mapping = cast(
- Mapping[
- str, Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype]
- ],
- dtype,
- )
- df = pd.read_csv(io.BytesIO(data), dtype=dtype_mapping, na_values="")
+ # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
+ dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)
+ df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
# Ensure all column names are lower case.
df.columns = [c.lower() for c in df.columns] # type: ignore
@@ -255,7 +252,10 @@ def _parse_sequence_qc_metadata(
) -> pd.DataFrame:
if isinstance(data, bytes):
# Get the dtype of the constant columns.
- dtype = self._sequence_qc_metadata_dtype
+ dtype_dict = self._sequence_qc_metadata_dtype
+
+ # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
+ dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)
# Read the CSV using the dtype dict.
df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
@@ -272,8 +272,8 @@ def _parse_sequence_qc_metadata(
# Add the sequence QC columns with appropriate missing values.
# For each column, set the value to either NA or NaN.
- for c, dtype in self._sequence_qc_metadata_dtype.items():
- if pd.api.types.is_integer_dtype(dtype):
+ for c, datum_dtype in self._sequence_qc_metadata_dtype.items():
+ if pd.api.types.is_integer_dtype(datum_dtype):
# Note: this creates a column with dtype int64.
df[c] = -1
else:
@@ -378,11 +378,8 @@ def _parse_surveillance_flags(
"sample_id": "object",
"is_surveillance": "boolean",
}
- # `dtype` of `dict[str, str]` is incompatible with `read_csv`
- dtype = cast(
- Mapping[str, Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype]],
- dtype_dict,
- )
+ # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
+ dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)
if isinstance(data, bytes):
# Read the CSV data.
@@ -516,7 +513,11 @@ def _parse_cohorts_metadata(
) -> pd.DataFrame:
if isinstance(data, bytes):
# Parse CSV data.
- dtype = self._cohorts_metadata_dtype
+ dtype_dict = self._cohorts_metadata_dtype
+
+ # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
+ dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)
+
df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
# Ensure all column names are lower case.
@@ -590,14 +591,19 @@ def _parse_aim_metadata(
assert self._aim_metadata_columns is not None
assert self._aim_metadata_dtype is not None
if isinstance(data, bytes):
- # Parse CSV data.
- df = pd.read_csv(
- io.BytesIO(data), dtype=self._aim_metadata_dtype, na_values=""
- )
+ # Parse CSV data but don't apply the dtype yet.
+ df = pd.read_csv(io.BytesIO(data), na_values="")
- # Ensure all column names are lower case.
+ # Convert all column names to lowercase.
df.columns = [c.lower() for c in df.columns] # type: ignore
+ # For each column in the DataFrame...
+ for c in df.columns:
+ # Apply the corresponding dtype from `_aim_metadata_dtype`.
+ # Convert the type to a NumPy dtype.
+ col_dtype_as_np = np.dtype(self._aim_metadata_dtype[c])
+ df[c] = df[c].astype(col_dtype_as_np)
+
return df
elif isinstance(data, FileNotFoundError):
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
index c2c6c105c..3d334444d 100644
--- a/tests/anoph/test_sample_metadata.py
+++ b/tests/anoph/test_sample_metadata.py
@@ -268,7 +268,7 @@ def validate_metadata(df, expected_columns):
# Check column types.
for c in df.columns:
- assert df[c].dtype.kind == expected_columns[c]
+ assert df[c].dtype.kind == expected_columns[c], c
@parametrize_with_cases("fixture,api", cases=".")
From 67ffa4780900547587c5a2e64b2a260415a1ae71 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 30 May 2025 18:08:20 +0100
Subject: [PATCH 22/32] Raise ValueError when view_alignments is given
irrelevant sample
---
malariagen_data/anoph/igv.py | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/malariagen_data/anoph/igv.py b/malariagen_data/anoph/igv.py
index e0c8472b5..3ece028ea 100644
--- a/malariagen_data/anoph/igv.py
+++ b/malariagen_data/anoph/igv.py
@@ -84,7 +84,13 @@ def _igv_view_alignments_tracks(
visibility_window: int = 20_000,
):
# Look up sample set for sample.
- sample_rec = self.sample_metadata().set_index("sample_id").loc[sample]
+ try:
+ sample_rec = self.sample_metadata().set_index("sample_id").loc[sample]
+ except KeyError as e:
+ raise ValueError(
+ f"No data found for sample {sample!r}. This sample might be unavailable or irrelevant with respect to settings."
+ ) from e
+
sample_set = sample_rec["sample_set"]
# Load data catalog.
From 23e2e7c437202b620fd0758392c7bc0cd5e2cd65 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 30 May 2025 18:11:55 +0100
Subject: [PATCH 23/32] Use raise from when re-wrapping exceptions in base.py
to provide better traceback (Pylint raise-missing-from)
---
malariagen_data/anoph/base.py | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index eb7ba8d6a..ed60c3ae4 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -778,10 +778,10 @@ def lookup_release(self, sample_set: base_params.sample_set) -> str:
try:
return self._cache_sample_set_to_release[sample_set]
- except KeyError:
+ except KeyError as e:
raise ValueError(
f"No release found for sample set {sample_set!r}. This sample set might be unavailable or irrelevant with respect to settings."
- )
+ ) from e
@check_types
@doc(
@@ -794,8 +794,8 @@ def lookup_study(self, sample_set: base_params.sample_set) -> str:
self._cache_sample_set_to_study = df_sample_sets["study_id"].to_dict()
try:
return self._cache_sample_set_to_study[sample_set]
- except KeyError:
- raise ValueError(f"No study ID found for sample set {sample_set!r}")
+ except KeyError as e:
+ raise ValueError(f"No study ID found for sample set {sample_set!r}") from e
@check_types
@doc(
@@ -810,8 +810,10 @@ def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
].to_dict(orient="index")
try:
return self._cache_sample_set_to_study_info[sample_set]
- except KeyError:
- raise ValueError(f"No study info found for sample set {sample_set!r}")
+ except KeyError as e:
+ raise ValueError(
+ f"No study info found for sample set {sample_set!r}"
+ ) from e
@check_types
@doc(
@@ -830,10 +832,10 @@ def lookup_terms_of_use_info(self, sample_set: base_params.sample_set) -> dict:
].to_dict(orient="index")
try:
return self._cache_sample_set_to_terms_of_use_info[sample_set]
- except KeyError:
+ except KeyError as e:
raise ValueError(
f"No terms-of-use info found for sample set {sample_set!r}"
- )
+ ) from e
def _prep_sample_sets_param(
self, *, sample_sets: Optional[base_params.sample_sets]
From 65fd83c596dcebb5ef478d3dfe794d2ef5e4ff77 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Tue, 3 Jun 2025 11:18:19 +0100
Subject: [PATCH 24/32] WIP: dev support for surveillance_use_only,
unrestricted_use_only params
---
malariagen_data/anoph/aim_data.py | 29 ++------
malariagen_data/anoph/base.py | 40 +++++++++++
malariagen_data/anoph/cnv_data.py | 89 +++++-------------------
malariagen_data/anoph/distance.py | 10 ++-
malariagen_data/anoph/pca.py | 52 ++++++++------
malariagen_data/anoph/sample_metadata.py | 18 +++--
malariagen_data/anoph/snp_data.py | 78 ++++++++++++++-------
7 files changed, 167 insertions(+), 149 deletions(-)
diff --git a/malariagen_data/anoph/aim_data.py b/malariagen_data/anoph/aim_data.py
index b215113fc..318981147 100644
--- a/malariagen_data/anoph/aim_data.py
+++ b/malariagen_data/anoph/aim_data.py
@@ -171,29 +171,12 @@ def aim_calls(
# If there are no sample query options, then default to an empty dict.
sample_query_options = sample_query_options or {}
- # Determine which samples match the sample query.
- loc_samples = df_samples.eval(prepared_sample_query, **sample_query_options)
-
- # Raise an error if no samples match the sample query.
- if not loc_samples.any():
- raise ValueError(
- f"No samples found for query {prepared_sample_query!r}"
- )
-
- # Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
- relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values
-
- # Get all the sample ids from the unfiltered AIM calls Dataset.
- ds_sample_ids = ds.coords["sample_id"].values
-
- # Get the indices of samples in the AIM calls Dataset that match the relevant sample ids.
- # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
- relevant_sample_indices = np.where(
- np.isin(ds_sample_ids, relevant_sample_ids)
- )[0]
-
- # Select only the relevant samples from the AIM calls Dataset.
- ds = ds.isel(samples=relevant_sample_indices)
+ ds = self._filter_sample_dataset(
+ ds=ds,
+ df_samples=df_samples,
+ sample_query=prepared_sample_query,
+ sample_query_options=sample_query_options,
+ )
return ds
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index ed60c3ae4..da8792783 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -28,6 +28,7 @@
from tqdm.auto import tqdm as tqdm_auto # type: ignore
from tqdm.dask import TqdmCallback # type: ignore
from yaspin import yaspin # type: ignore
+import xarray as xr
from ..util import (
CacheMiss,
@@ -933,6 +934,45 @@ def _prep_sample_query_param(
return prepped_sample_query
+ def _filter_sample_dataset(
+ self,
+ *,
+ ds: xr.Dataset,
+ df_samples: pd.DataFrame,
+ sample_query: str,
+ sample_query_options: dict,
+ ) -> xr.Dataset:
+ """Filters the given Dataset using the given DataFrame and query."""
+
+ # Note: "prepare" the params before calling this function.
+
+ # Determine which samples match the sample query.
+ if sample_query != "":
+ loc_samples = df_samples.eval(sample_query, **sample_query_options)
+ else:
+ loc_samples = pd.Series(True, index=df_samples.index)
+
+ # Raise an error if no samples match the sample query.
+ if not loc_samples.any():
+ raise ValueError(f"No samples found for query {sample_query!r}")
+
+ # Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
+ relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values
+
+ # Get all the sample ids from the unfiltered Dataset.
+ ds_sample_ids = ds.coords["sample_id"].values
+
+ # Get the indices of samples in the Dataset that match the relevant sample ids.
+ # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
+ relevant_sample_indices = np.where(np.isin(ds_sample_ids, relevant_sample_ids))[
+ 0
+ ]
+
+ # Select only the relevant samples from the Dataset.
+ ds = ds.isel(samples=relevant_sample_indices)
+
+ return ds
+
def _results_cache_add_analysis_params(self, params: dict):
# Expect sub-classes will override to add any analysis parameters.
pass
diff --git a/malariagen_data/anoph/cnv_data.py b/malariagen_data/anoph/cnv_data.py
index 65c3edfad..2f8cca6c3 100644
--- a/malariagen_data/anoph/cnv_data.py
+++ b/malariagen_data/anoph/cnv_data.py
@@ -256,32 +256,13 @@ def cnv_hmm(
# If there are no sample query options, then default to an empty dict.
sample_query_options = sample_query_options or {}
- # Determine which samples match the sample query.
- loc_samples = df_samples.eval(
- prepared_sample_query, **sample_query_options
+ ds = self._filter_sample_dataset(
+ ds=ds,
+ df_samples=df_samples,
+ sample_query=prepared_sample_query,
+ sample_query_options=sample_query_options,
)
- # Raise an error if no samples match the sample query.
- if not loc_samples.any():
- raise ValueError(
- f"No samples found for query {prepared_sample_query!r}"
- )
-
- # Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
- relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values
-
- # Get all the sample ids from the unfiltered CNV HMM Dataset.
- ds_sample_ids = ds.coords["sample_id"].values
-
- # Get the indices of samples in the CNV HMM Dataset that match the relevant sample ids.
- # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
- relevant_sample_indices = np.where(
- np.isin(ds_sample_ids, relevant_sample_ids)
- )[0]
-
- # Select only the relevant samples from the CNV HMM Dataset.
- ds = ds.isel(samples=relevant_sample_indices)
-
debug("handle coverage variance filter")
if max_coverage_variance is not None:
cov_var = ds["sample_coverage_variance"].values
@@ -476,30 +457,15 @@ def cnv_coverage_calls(
# Get the relevant sample metadata.
df_samples = self.sample_metadata(sample_sets=prepared_sample_set)
- # Determine which samples match the sample query.
- if prepared_sample_query != "":
- loc_samples = df_samples.eval(prepared_sample_query)
- else:
- loc_samples = pd.Series(True, index=df_samples.index)
-
- # Raise an error if no samples match the sample query.
- if not loc_samples.any():
- raise ValueError(f"No samples found for query {prepared_sample_query!r}")
-
- # Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
- relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values
+ # If there is no sample query, then default to an empty str.
+ prepared_sample_query = prepared_sample_query or ""
- # Get all the sample ids from the unfiltered CNV coverage calls Dataset.
- ds_sample_ids = ds.coords["sample_id"].values
-
- # Get the indices of samples in the CNV coverage calls Dataset that match the relevant sample ids.
- # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
- relevant_sample_indices = np.where(np.isin(ds_sample_ids, relevant_sample_ids))[
- 0
- ]
-
- # Select only the relevant samples from the CNV coverage calls Dataset.
- ds = ds.isel(samples=relevant_sample_indices)
+ ds = self._filter_sample_dataset(
+ ds=ds,
+ df_samples=df_samples,
+ sample_query=prepared_sample_query,
+ sample_query_options={},
+ )
return ds
@@ -690,29 +656,12 @@ def cnv_discordant_read_calls(
# If there are no sample query options, then default to an empty dict.
sample_query_options = sample_query_options or {}
- # Determine which samples match the sample query.
- loc_samples = df_samples.eval(prepared_sample_query, **sample_query_options)
-
- # Raise an error if no samples match the sample query.
- if not loc_samples.any():
- raise ValueError(
- f"No samples found for query {prepared_sample_query!r}"
- )
-
- # Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
- relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values
-
- # Get all the sample ids from the unfiltered CNV discordant reads Dataset.
- ds_sample_ids = ds.coords["sample_id"].values
-
- # Get the indices of samples in the CNV discordant reads Dataset that match the relevant sample ids.
- # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
- relevant_sample_indices = np.where(
- np.isin(ds_sample_ids, relevant_sample_ids)
- )[0]
-
- # Select only the relevant samples from the CNV discordant reads Dataset.
- ds = ds.isel(samples=relevant_sample_indices)
+ ds = self._filter_sample_dataset(
+ ds=ds,
+ df_samples=df_samples,
+ sample_query=prepared_sample_query,
+ sample_query_options=sample_query_options,
+ )
return ds
diff --git a/malariagen_data/anoph/distance.py b/malariagen_data/anoph/distance.py
index 60d69edb8..f76116a62 100644
--- a/malariagen_data/anoph/distance.py
+++ b/malariagen_data/anoph/distance.py
@@ -115,7 +115,10 @@ def biallelic_diplotype_pairwise_distances(
# invalidate any previously cached data.
name = "biallelic_diplotype_pairwise_distances"
- # Normalize params for consistent hash value.
+ ## Normalize params for consistent hash value.
+
+ # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
+ # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
(
sample_sets_prepped,
sample_indices_prepped,
@@ -269,7 +272,10 @@ def njt(
# invalidate any previously cached data.
name = "njt_v1"
- # Normalize params for consistent hash value.
+ ## Normalize params for consistent hash value.
+
+ # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
+ # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
(
sample_sets_prepped,
sample_indices_prepped,
diff --git a/malariagen_data/anoph/pca.py b/malariagen_data/anoph/pca.py
index 18cd0bdb3..9bbcdcd3f 100644
--- a/malariagen_data/anoph/pca.py
+++ b/malariagen_data/anoph/pca.py
@@ -80,27 +80,39 @@ def pca(
) -> Tuple[pca_params.df_pca, pca_params.evr]:
# Change this name if you ever change the behaviour of this function, to
# invalidate any previously cached data.
- name = "pca_v4"
+ name = "pca_v5"
- # Normalize params for consistent hash value.
+ ## Normalize params for consistent hash value.
+
+ # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
+ # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
(
- sample_sets_prepped,
- sample_indices_prepped,
+ prepared_sample_sets,
+ prepared_sample_indices,
) = self._prep_sample_selection_cache_params(
sample_sets=sample_sets,
sample_query=sample_query,
sample_query_options=sample_query_options,
sample_indices=sample_indices,
)
- region_prepped = self._prep_region_cache_param(region=region)
- site_mask_prepped = self._prep_optional_site_mask_param(site_mask=site_mask)
+ prepared_region = self._prep_region_cache_param(region=region)
+ prepared_site_mask = self._prep_optional_site_mask_param(site_mask=site_mask)
+
+ # Delete original parameters to prevent accidental use.
+ del sample_sets
+ del sample_indices
+ del sample_query
+ del sample_query_options
+ del region
+ del site_mask
+
params = dict(
- region=region_prepped,
+ region=prepared_region,
n_snps=n_snps,
thin_offset=thin_offset,
- sample_sets=sample_sets_prepped,
- sample_indices=sample_indices_prepped,
- site_mask=site_mask_prepped,
+ sample_sets=prepared_sample_sets,
+ sample_indices=prepared_sample_indices,
+ site_mask=prepared_site_mask,
site_class=site_class,
min_minor_ac=min_minor_ac,
max_missing_an=max_missing_an,
@@ -127,22 +139,18 @@ def pca(
samples = results["samples"]
loc_keep_fit = results["loc_keep_fit"]
- # Load sample metadata.
- df_samples = self.sample_metadata(
- sample_sets=sample_sets,
- )
+ # Create a new DataFrame containing the PCA coords data.
+ df_pca = pd.DataFrame(coords, index=samples)
- # Ensure aligned with genotype data.
- df_samples = df_samples.set_index("sample_id").loc[samples].reset_index()
+ # Name the DataFrame's columns PC1, PC2, etc.
+ df_pca.columns = pd.Index([f"PC{i+1}" for i in range(coords.shape[1])])
- # Combine coords and sample metadata.
- df_coords = pd.DataFrame(
- {f"PC{i + 1}": coords[:, i] for i in range(coords.shape[1])}
- )
- df_pca = df_samples.join(df_coords, how="inner")
- # Add a column for which samples were included in fitting.
+ # Add a column to indicate which samples were included in fitting.
df_pca["pca_fit"] = loc_keep_fit
+ # Name the index.
+ df_pca.index.name = "sample_id"
+
return df_pca, evr
def _pca(
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index c06965cb9..2a47f731e 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -1054,20 +1054,26 @@ def _prep_sample_selection_cache_params(
sample_indices: Optional[base_params.sample_indices],
) -> Tuple[List[str], Optional[List[int]]]:
# Normalise sample sets.
- sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
- sample_query = self._prep_sample_query_param(sample_query=sample_query)
+ prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
- if sample_query is not None:
+ # Delete original parameters to prevent accidental use.
+ del sample_sets
+ del sample_query
+
+ if prepared_sample_query is not None:
# Resolve query to a list of integers for more cache hits - we
# do this because there are different ways to write the same pandas
# query, and so it's better to evaluate the query and use a list of
# integer indices instead.
- df_samples = self.sample_metadata(sample_sets=sample_sets)
+ df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)
sample_query_options = sample_query_options or {}
- loc_samples = df_samples.eval(sample_query, **sample_query_options).values
+ loc_samples = df_samples.eval(
+ prepared_sample_query, **sample_query_options
+ ).values
sample_indices = np.nonzero(loc_samples)[0].tolist()
- return sample_sets, sample_indices
+ return prepared_sample_sets, sample_indices
def _results_cache_add_analysis_params(self, params: dict):
super()._results_cache_add_analysis_params(params)
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
index 7b01ee7b4..95ec33e90 100644
--- a/malariagen_data/anoph/snp_data.py
+++ b/malariagen_data/anoph/snp_data.py
@@ -1007,29 +1007,34 @@ def snp_calls(
)
# Normalise parameters.
- sample_sets_prepped: Tuple[str, ...] = tuple(
+ prepared_regions: Tuple[Region, ...] = tuple(parse_multi_region(self, region))
+ prepared_sample_sets: Tuple[str, ...] = tuple(
self._prep_sample_sets_param(sample_sets=sample_sets)
)
- del sample_sets
+
sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
- del sample_query
+
if sample_indices is not None:
- sample_indices_prepped: Optional[Tuple[int, ...]] = tuple(sample_indices)
+ prepared_sample_indices: Optional[Tuple[int, ...]] = tuple(sample_indices)
else:
- sample_indices_prepped = sample_indices
+ prepared_sample_indices = sample_indices
+
+ prepared_site_mask = self._prep_optional_site_mask_param(site_mask=site_mask)
+
+ # Delete original parameters to prevent accidental use.
+ del sample_sets
+ del sample_query
del sample_indices
- regions: Tuple[Region, ...] = tuple(parse_multi_region(self, region))
del region
- site_mask_prepped = self._prep_optional_site_mask_param(site_mask=site_mask)
del site_mask
return self._snp_calls(
- regions=regions,
- sample_sets=sample_sets_prepped,
+ regions=prepared_regions,
+ sample_sets=prepared_sample_sets,
sample_query=sample_query_prepped,
sample_query_options=sample_query_options,
- sample_indices=sample_indices_prepped,
- site_mask=site_mask_prepped,
+ sample_indices=prepared_sample_indices,
+ site_mask=prepared_site_mask,
site_class=site_class,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
@@ -1134,7 +1139,10 @@ def _snp_calls(
inline_array,
chunks,
):
+ # Note: sample_sets and sample_query should be "prepared" before being passed to this private function.
+
# Get SNP calls and concatenate multiple sample sets and/or regions.
+ # Note: we don't cache different sample_query or sample_indices subsets.
ds = self._cached_snp_calls(
regions=regions,
sample_sets=sample_sets,
@@ -1146,12 +1154,19 @@ def _snp_calls(
# Handle sample selection.
if sample_query is not None:
+ # Get the relevant sample metadata.
df_samples = self.sample_metadata(sample_sets=sample_sets)
+
+ # If there are no sample query options, then default to an empty dict.
sample_query_options = sample_query_options or {}
- loc_samples = df_samples.eval(sample_query, **sample_query_options).values
- if np.count_nonzero(loc_samples) == 0:
- raise ValueError(f"No samples found for query {sample_query!r}")
- ds = ds.isel(samples=loc_samples)
+
+ ds = self._filter_sample_dataset(
+ ds=ds,
+ df_samples=df_samples,
+ sample_query=sample_query,
+ sample_query_options=sample_query_options,
+ )
+
elif sample_indices is not None:
ds = ds.isel(samples=list(sample_indices))
@@ -1287,7 +1302,10 @@ def snp_allele_counts(
# to invalidate any previously cached data.
name = "snp_allele_counts_v2"
- # Normalize params for consistent hash value.
+ ## Normalize params for consistent hash value.
+
+ # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
+ # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
(
sample_sets_prepped,
sample_indices_prepped,
@@ -1829,33 +1847,39 @@ def biallelic_diplotypes(
) -> Tuple[np.ndarray, np.ndarray]:
# Change this name if you ever change the behaviour of this function, to
# invalidate any previously cached data.
- name = "biallelic_diplotypes"
+ name = "biallelic_diplotypes_v2"
- # Normalize params for consistent hash value.
+ ## Normalize params for consistent hash value.
+
+ # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
+ # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
(
- sample_sets_prepped,
- sample_indices_prepped,
+ prepared_sample_sets,
+ prepared_sample_indices,
) = self._prep_sample_selection_cache_params(
sample_sets=sample_sets,
sample_query=sample_query,
sample_query_options=sample_query_options,
sample_indices=sample_indices,
)
- region_prepped = self._prep_region_cache_param(region=region)
- site_mask_prepped = self._prep_optional_site_mask_param(site_mask=site_mask)
+ prepared_region = self._prep_region_cache_param(region=region)
+ prepared_site_mask = self._prep_optional_site_mask_param(site_mask=site_mask)
+
+ # Delete original parameters to prevent accidental use.
del sample_sets
del sample_query
del sample_query_options
del sample_indices
del region
del site_mask
+
params = dict(
- region=region_prepped,
+ region=prepared_region,
n_snps=n_snps,
thin_offset=thin_offset,
- sample_sets=sample_sets_prepped,
- sample_indices=sample_indices_prepped,
- site_mask=site_mask_prepped,
+ sample_sets=prepared_sample_sets,
+ sample_indices=prepared_sample_indices,
+ site_mask=prepared_site_mask,
site_class=site_class,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
@@ -1900,6 +1924,8 @@ def _biallelic_diplotypes(
inline_array,
chunks,
):
+ # Note: this uses sample_indices and should not expect a sample_query.
+
# Access biallelic SNPs.
ds = self.biallelic_snp_calls(
region=region,
From 7fce9bb0dbab2241368a85329a2ce766194580b2 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Tue, 3 Jun 2025 11:24:08 +0100
Subject: [PATCH 25/32] Revert cache name for biallelic_diplotypes. (Function
behaviour unchanged.)
---
malariagen_data/anoph/snp_data.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
index 95ec33e90..590874459 100644
--- a/malariagen_data/anoph/snp_data.py
+++ b/malariagen_data/anoph/snp_data.py
@@ -1847,7 +1847,7 @@ def biallelic_diplotypes(
) -> Tuple[np.ndarray, np.ndarray]:
# Change this name if you ever change the behaviour of this function, to
# invalidate any previously cached data.
- name = "biallelic_diplotypes_v2"
+ name = "biallelic_diplotypes"
## Normalize params for consistent hash value.
From df207c8c24b0bd353ee13b4eeb306290c1bd5568 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Tue, 3 Jun 2025 15:18:30 +0100
Subject: [PATCH 26/32] WIP: dev support for surveillance_use_only,
unrestricted_use_only params
---
malariagen_data/anoph/pca.py | 21 ++++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/malariagen_data/anoph/pca.py b/malariagen_data/anoph/pca.py
index 9bbcdcd3f..4b73b84d2 100644
--- a/malariagen_data/anoph/pca.py
+++ b/malariagen_data/anoph/pca.py
@@ -142,14 +142,29 @@ def pca(
# Create a new DataFrame containing the PCA coords data.
df_pca = pd.DataFrame(coords, index=samples)
- # Name the DataFrame's columns PC1, PC2, etc.
+ # Name the index of the PCA data. (Casting the index to a string type is currently disabled.)
+ df_pca.index.name = "sample_id"
+ # df_pca.index = df_pca.index.astype(str)
+
+ # Name the DataFrame's columns as PC1, PC2, etc.
df_pca.columns = pd.Index([f"PC{i+1}" for i in range(coords.shape[1])])
+ # Load the sample metadata.
+ df_samples = self.sample_metadata(
+ sample_sets=prepared_sample_sets,
+ )
+
+ # Set the index of the sample metadata.
+ df_samples.set_index("sample_id", inplace=True)
+
+ # Join the relevant sample metadata.
+ df_pca = df_pca.join(df_samples, how="left", on="sample_id")
+
# Add a column to indicate which samples were included in fitting.
df_pca["pca_fit"] = loc_keep_fit
- # Name the index.
- df_pca.index.name = "sample_id"
+ # Keep "sample_id" as a column, so that it can be specified as a `hover_name` in `plot_pca_coords`, etc.
+ df_pca.reset_index(inplace=True)
return df_pca, evr
From d927b063762f9cb4299a3c8e40a73959e994a6f3 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Tue, 3 Jun 2025 18:22:08 +0100
Subject: [PATCH 27/32] Fix misspelling
---
malariagen_data/anoph/sample_metadata.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 2a47f731e..e9a6e78b9 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -1345,7 +1345,7 @@ def _setup_cohort_queries(
cohort_size: Optional[base_params.cohort_size],
min_cohort_size: Optional[base_params.min_cohort_size],
):
- """Convenience function to normalise the `cohorts` paramater to a
+ """Convenience function to normalise the `cohorts` parameter to a
dictionary mapping cohort labels to sample metadata queries."""
if isinstance(cohorts, dict):
From 1177e63422920532ea58f12a4eb908cb550ae290 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 5 Jun 2025 17:01:09 +0100
Subject: [PATCH 28/32] Use python engine for sample_query to support extension
dtypes
---
malariagen_data/anoph/base.py | 5 ++++-
malariagen_data/anoph/sample_metadata.py | 8 ++++++--
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index da8792783..77150b58e 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -948,7 +948,10 @@ def _filter_sample_dataset(
# Determine which samples match the sample query.
if sample_query != "":
- loc_samples = df_samples.eval(sample_query, **sample_query_options)
+ # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
+ loc_samples = df_samples.eval(
+ sample_query, **sample_query_options, engine="python"
+ )
else:
loc_samples = pd.Series(True, index=df_samples.index)
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index e9a6e78b9..6403c9d53 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -777,7 +777,10 @@ def sample_metadata(
if prepared_sample_query is not None:
# Assume a pandas query string.
sample_query_options = sample_query_options or {}
- df_samples = df_samples.query(prepared_sample_query, **sample_query_options)
+ # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
+ df_samples = df_samples.query(
+ prepared_sample_query, **sample_query_options, engine="python"
+ )
df_samples = df_samples.reset_index(drop=True)
elif sample_indices is not None:
# Assume it is an indexer.
@@ -1068,8 +1071,9 @@ def _prep_sample_selection_cache_params(
# integer indices instead.
df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)
sample_query_options = sample_query_options or {}
+ # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
loc_samples = df_samples.eval(
- prepared_sample_query, **sample_query_options
+ prepared_sample_query, **sample_query_options, engine="python"
).values
sample_indices = np.nonzero(loc_samples)[0].tolist()
From d9ba5ebcf32d88a855b0d7975eac4ec539113733 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Tue, 10 Jun 2025 12:05:32 +0100
Subject: [PATCH 29/32] Add validate_sample_selection_params to funcs with
sample_query, sample_indices. Add sample_query_options to
biallelic_snps_to_plink.
---
malariagen_data/anoph/distance.py | 15 +++++++++++++++
malariagen_data/anoph/pca.py | 5 +++++
malariagen_data/anoph/snp_data.py | 19 +++++++++++++++++--
malariagen_data/anoph/to_plink.py | 7 +++++++
4 files changed, 44 insertions(+), 2 deletions(-)
diff --git a/malariagen_data/anoph/distance.py b/malariagen_data/anoph/distance.py
index f76116a62..038f041f9 100644
--- a/malariagen_data/anoph/distance.py
+++ b/malariagen_data/anoph/distance.py
@@ -115,6 +115,11 @@ def biallelic_diplotype_pairwise_distances(
# invalidate any previously cached data.
name = "biallelic_diplotype_pairwise_distances"
+ # Check that sample_query and sample_indices are not both provided.
+ base_params.validate_sample_selection_params(
+ sample_query=sample_query, sample_indices=sample_indices
+ )
+
## Normalize params for consistent hash value.
# Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
@@ -272,6 +277,11 @@ def njt(
# invalidate any previously cached data.
name = "njt_v1"
+ # Check that sample_query and sample_indices are not both provided.
+ base_params.validate_sample_selection_params(
+ sample_query=sample_query, sample_indices=sample_indices
+ )
+
## Normalize params for consistent hash value.
# Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
@@ -458,6 +468,11 @@ def plot_njt(
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.native_chunks,
) -> plotly_params.figure:
+ # Check that sample_query and sample_indices are not both provided.
+ base_params.validate_sample_selection_params(
+ sample_query=sample_query, sample_indices=sample_indices
+ )
+
# Only import anjl if needed, as it requires a couple of seconds to compile
# functions.
import anjl # type: ignore
diff --git a/malariagen_data/anoph/pca.py b/malariagen_data/anoph/pca.py
index 4b73b84d2..24e98087b 100644
--- a/malariagen_data/anoph/pca.py
+++ b/malariagen_data/anoph/pca.py
@@ -82,6 +82,11 @@ def pca(
# invalidate any previously cached data.
name = "pca_v5"
+ # Check that sample_query and sample_indices are not both provided.
+ base_params.validate_sample_selection_params(
+ sample_query=sample_query, sample_indices=sample_indices
+ )
+
## Normalize params for consistent hash value.
# Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
index 590874459..b4e35363f 100644
--- a/malariagen_data/anoph/snp_data.py
+++ b/malariagen_data/anoph/snp_data.py
@@ -456,7 +456,7 @@ def snp_genotypes(
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.native_chunks,
) -> da.Array:
- # Additional parameter checks.
+ # Check that sample_query and sample_indices are not both provided.
base_params.validate_sample_selection_params(
sample_query=sample_query, sample_indices=sample_indices
)
@@ -1001,7 +1001,7 @@ def snp_calls(
max_cohort_size: Optional[base_params.max_cohort_size] = None,
random_seed: base_params.random_seed = 42,
) -> xr.Dataset:
- # Additional parameter checks.
+ # Check that sample_query and sample_indices are not both provided.
base_params.validate_sample_selection_params(
sample_query=sample_query, sample_indices=sample_indices
)
@@ -1302,6 +1302,11 @@ def snp_allele_counts(
# to invalidate any previously cached data.
name = "snp_allele_counts_v2"
+ # Check that sample_query and sample_indices are not both provided.
+ base_params.validate_sample_selection_params(
+ sample_query=sample_query, sample_indices=sample_indices
+ )
+
## Normalize params for consistent hash value.
# Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
@@ -1688,6 +1693,11 @@ def biallelic_snp_calls(
n_snps: Optional[base_params.n_snps] = None,
thin_offset: base_params.thin_offset = 0,
) -> xr.Dataset:
+ # Check that sample_query and sample_indices are not both provided.
+ base_params.validate_sample_selection_params(
+ sample_query=sample_query, sample_indices=sample_indices
+ )
+
# Perform an allele count.
ac = self.snp_allele_counts(
region=region,
@@ -1849,6 +1859,11 @@ def biallelic_diplotypes(
# invalidate any previously cached data.
name = "biallelic_diplotypes"
+ # Check that sample_query and sample_indices are not both provided.
+ base_params.validate_sample_selection_params(
+ sample_query=sample_query, sample_indices=sample_indices
+ )
+
## Normalize params for consistent hash value.
# Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
diff --git a/malariagen_data/anoph/to_plink.py b/malariagen_data/anoph/to_plink.py
index dda65281e..0ed51fccd 100644
--- a/malariagen_data/anoph/to_plink.py
+++ b/malariagen_data/anoph/to_plink.py
@@ -57,6 +57,7 @@ def biallelic_snps_to_plink(
thin_offset: base_params.thin_offset = 0,
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
+ sample_query_options: Optional[base_params.sample_query_options] = None,
sample_indices: Optional[base_params.sample_indices] = None,
site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
min_minor_ac: Optional[
@@ -69,6 +70,11 @@ def biallelic_snps_to_plink(
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.native_chunks,
):
+ # Check that sample_query and sample_indices are not both provided.
+ base_params.validate_sample_selection_params(
+ sample_query=sample_query, sample_indices=sample_indices
+ )
+
# Define output files
plink_file_path = f"{output_dir}/{region}.{n_snps}.{min_minor_ac}.{max_missing_an}.{thin_offset}"
@@ -84,6 +90,7 @@ def biallelic_snps_to_plink(
region=region,
sample_sets=sample_sets,
sample_query=sample_query,
+ sample_query_options=sample_query_options,
sample_indices=sample_indices,
site_mask=site_mask,
min_minor_ac=min_minor_ac,
From 9b7d6cc31d43897f0ad8856db97b7279e138b925 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Tue, 10 Jun 2025 12:17:28 +0100
Subject: [PATCH 30/32] Amend sample_metadata to allow sample_indices when
surveillance_use_only
---
malariagen_data/anoph/sample_metadata.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 6403c9d53..9908e213c 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -773,7 +773,8 @@ def sample_metadata(
for on, data in self._extra_metadata:
df_samples = df_samples.merge(data, how="left", on=on)
- # Apply the sample_query or sample_indices, if specified.
+ # Apply the sample_query, if there is one.
+ # Note: this might have been internally modified, e.g. `is_surveillance == True`.
if prepared_sample_query is not None:
# Assume a pandas query string.
sample_query_options = sample_query_options or {}
@@ -782,7 +783,10 @@ def sample_metadata(
prepared_sample_query, **sample_query_options, engine="python"
)
df_samples = df_samples.reset_index(drop=True)
- elif sample_indices is not None:
+
+ # Apply the sample_indices, if there are any.
+ # Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`.
+ if sample_indices is not None:
# Assume it is an indexer.
df_samples = df_samples.iloc[sample_indices]
df_samples = df_samples.reset_index(drop=True)
From 6c4e74f6075901313666c208514c4c7ee6f21cc0 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Fri, 13 Jun 2025 17:46:11 +0100
Subject: [PATCH 31/32] WIP: handle sample_indices when surveillance_use_only
---
malariagen_data/anoph/sample_metadata.py | 70 +++++++++++++--
malariagen_data/anoph/snp_data.py | 107 +++++++++++++++--------
2 files changed, 133 insertions(+), 44 deletions(-)
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
index 9908e213c..00bb10675 100644
--- a/malariagen_data/anoph/sample_metadata.py
+++ b/malariagen_data/anoph/sample_metadata.py
@@ -1060,6 +1060,36 @@ def _prep_sample_selection_cache_params(
sample_query_options: Optional[base_params.sample_query_options],
sample_indices: Optional[base_params.sample_indices],
) -> Tuple[List[str], Optional[List[int]]]:
+ # Check that sample_query and sample_indices are not both provided.
+ base_params.validate_sample_selection_params(
+ sample_query=sample_query, sample_indices=sample_indices
+ )
+
+ # Resolve query to a list of integers for more cache hits - we
+ # do this because there are different ways to write the same pandas
+ # query, and so it's better to evaluate the query and use a list of
+ # integer indices instead.
+
+ # Scenario 1: No `sample_query` nor `sample_indices` were given,
+ # and there is no internal `sample_query`,
+ # so no `sample_indices` will be returned.
+
+ # Scenario 2: No `sample_query` nor `sample_indices` were given,
+ # but there is an internal `sample_query`,
+ # which will be converted into `sample_indices` and returned.
+
+ # Scenario 3: Only `sample_query` has been provided,
+ # which will be converted into `sample_indices` and returned.
+ # This will be handled the same as Scenario 2.
+
+ # Scenario 4: Only `sample_indices` has been provided,
+ # and there is no internal `sample_query`,
+ # simply return `sample_indices`.
+
+ # Scenario 5: Only `sample_indices` has been provided,
+ # but there is also an internal `sample_query`, still return `sample_indices`,
+ # which ought to already align with `sample_metadata`.
+
# Normalise sample sets.
prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
@@ -1068,20 +1098,46 @@ def _prep_sample_selection_cache_params(
del sample_sets
del sample_query
- if prepared_sample_query is not None:
- # Resolve query to a list of integers for more cache hits - we
- # do this because there are different ways to write the same pandas
- # query, and so it's better to evaluate the query and use a list of
- # integer indices instead.
+ # Start with assuming there are no sample indices.
+ # This can be returned if there is no `prepared_sample_query` nor `sample_indices`.
+ prepared_sample_indices = None
+
+ # If there is a `prepared_sample_query` but no `sample_indices`...
+ if prepared_sample_query is not None and sample_indices is None:
+ # Get the unfiltered sample metadata for the given sample sets.
+ # Note: we don't want to pass the `sample_query` to `sample_metadata` here
+ # because we want to get the sample indices that represent the `sample_query`.
df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)
+
+ # Default the sample_query_options to an empty dict.
sample_query_options = sample_query_options or {}
+
# Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
+ # Get the Pandas Series as a NumPy array of Boolean values.
+ # Note: if `prepared_sample_query` is an internal query, this will select all samples,
+ # since `sample_metadata` should have already applied the internal query.
loc_samples = df_samples.eval(
prepared_sample_query, **sample_query_options, engine="python"
).values
- sample_indices = np.nonzero(loc_samples)[0].tolist()
- return prepared_sample_sets, sample_indices
+ # Convert the sample indices to a list.
+ # Get the indices of the True values in the Boolean array and convert it to a list of integers.
+ prepared_sample_indices = np.nonzero(loc_samples)[0].tolist()
+
+ # If there is a `prepared_sample_query` and a `sample_indices`...
+ elif prepared_sample_query is not None and sample_indices is not None:
+ # Given that we don't allow both `sample_query` and `sample_indices` params in this function,
+ # we can deduce that the `prepared_sample_query` has resulted from an internal query.
+ # Given that `sample_indices` should be aligned with the results of `sample_metadata`,
+ # which should already apply the internal query, simply return the given `sample_indices`.
+
+ prepared_sample_indices = sample_indices
+
+ # If there is no `prepared_sample_query` but there is a `sample_indices`...
+ elif prepared_sample_query is None and sample_indices is not None:
+ prepared_sample_indices = sample_indices
+
+ return prepared_sample_sets, prepared_sample_indices
def _results_cache_add_analysis_params(self, params: dict):
super()._results_cache_add_analysis_params(params)
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
index b4e35363f..2c1f0fc46 100644
--- a/malariagen_data/anoph/snp_data.py
+++ b/malariagen_data/anoph/snp_data.py
@@ -753,7 +753,7 @@ def _locate_site_class(
try:
loc_ann = self._cache_locate_site_class[cache_key]
- except KeyError:
+ except KeyError as exc:
# Access site annotations data.
ds_ann = self._site_annotations_raw(
contig=region.contig,
@@ -877,7 +877,7 @@ def _locate_site_class(
) | ((seq_cls == SEQ_CLS_DOWNSTREAM) & (seq_relpos_start > 10_000))
else:
- raise NotImplementedError(site_class)
+ raise NotImplementedError(site_class) from exc
# N.B., site annotations data are provided for every position in the genome. We need to
# therefore subset to SNP positions.
@@ -1007,20 +1007,21 @@ def snp_calls(
)
# Normalise parameters.
- prepared_regions: Tuple[Region, ...] = tuple(parse_multi_region(self, region))
- prepared_sample_sets: Tuple[str, ...] = tuple(
- self._prep_sample_sets_param(sample_sets=sample_sets)
- )
-
- sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
-
- if sample_indices is not None:
- prepared_sample_indices: Optional[Tuple[int, ...]] = tuple(sample_indices)
- else:
- prepared_sample_indices = sample_indices
-
+ prepared_regions = parse_multi_region(self, region)
prepared_site_mask = self._prep_optional_site_mask_param(site_mask=site_mask)
+ # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
+ # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
+ (
+ prepared_sample_sets,
+ prepared_sample_indices,
+ ) = self._prep_sample_selection_cache_params(
+ sample_sets=sample_sets,
+ sample_query=sample_query,
+ sample_query_options=sample_query_options,
+ sample_indices=sample_indices,
+ )
+
# Delete original parameters to prevent accidental use.
del sample_sets
del sample_query
@@ -1028,12 +1029,22 @@ def snp_calls(
del region
del site_mask
+ # Convert lists to tuples to avoid CacheMiss "TypeError: unhashable type: 'list'".
+ prepared_regions_tuple: Tuple[Region, ...] = tuple(prepared_regions)
+ prepared_sample_sets_tuple: Optional[Tuple[str, ...]] = (
+ tuple(prepared_sample_sets) if prepared_sample_sets is not None else None
+ )
+ prepared_sample_indices_tuple: Optional[Tuple[int, ...]] = (
+ tuple(prepared_sample_indices)
+ if prepared_sample_indices is not None
+ else None
+ )
+
+ # Note: `_snp_calls` should only take `sample_indices`, not `sample_query`, to facilitate caching.
return self._snp_calls(
- regions=prepared_regions,
- sample_sets=prepared_sample_sets,
- sample_query=sample_query_prepped,
- sample_query_options=sample_query_options,
- sample_indices=prepared_sample_indices,
+ regions=prepared_regions_tuple,
+ sample_sets=prepared_sample_sets_tuple,
+ sample_indices=prepared_sample_indices_tuple,
site_mask=prepared_site_mask,
site_class=site_class,
cohort_size=cohort_size,
@@ -1127,8 +1138,6 @@ def _snp_calls(
*,
regions: Tuple[Region, ...],
sample_sets,
- sample_query,
- sample_query_options,
sample_indices,
site_mask,
site_class,
@@ -1139,10 +1148,15 @@ def _snp_calls(
inline_array,
chunks,
):
- # Note: sample_sets and sample_query should be "prepared" before being passed to this private function.
+ ## Get SNP calls and concatenate multiple sample sets and/or regions.
+
+ # Note: sample_sets should be "prepared" before being passed to this private function.
+
+ # Note: `_snp_calls` should only take `sample_indices`, not `sample_query`.
+ # Use `_prep_sample_selection_cache_params` to convert `sample_query` to `sample_indices`.
+
+ # Note: we don't cache different sample_indices subsets, which are selected below.
- # Get SNP calls and concatenate multiple sample sets and/or regions.
- # Note: we don't cache different sample_query or sample_indices subsets.
ds = self._cached_snp_calls(
regions=regions,
sample_sets=sample_sets,
@@ -1153,22 +1167,41 @@ def _snp_calls(
)
# Handle sample selection.
- if sample_query is not None:
+ if sample_indices is not None:
+ # Note: `sample_indices` could be any tuple of integers, while the `ds` Dataset will contain data for all samples in the `sample_sets`.
+ # In other words, the internal `sample_query` is not being applied to `ds`.
+ # We need to get the filtered set of samples from `sample_metadata` and then select samples based on that set.
+
# Get the relevant sample metadata.
- df_samples = self.sample_metadata(sample_sets=sample_sets)
+ relevant_samples_df = self.sample_metadata(sample_sets=sample_sets)
- # If there are no sample query options, then default to an empty dict.
- sample_query_options = sample_query_options or {}
+ # We need to select only the samples that are identified by the `sample_indices` tuple relative to the results of `sample_metadata`.
+ # However, the `ds` Dataset contains data for all samples in the `sample_sets`, regardless of any internal `sample_query`.
- ds = self._filter_sample_dataset(
- ds=ds,
- df_samples=df_samples,
- sample_query=sample_query,
- sample_query_options=sample_query_options,
- )
+ # Get the samples identified via `sample_indices`.
+ # Note: this might raise `IndexError` if the user provides bad indices, e.g. "positional indexers are out-of-bounds".
+ # Note: `sample_indices` needs to be a list rather than tuple for `iloc`, otherwise `IndexingError`, e.g. "Too many indexers".
+ sample_indices_as_list = list(sample_indices)
+ selected_samples_df = relevant_samples_df.iloc[sample_indices_as_list]
- elif sample_indices is not None:
- ds = ds.isel(samples=list(sample_indices))
+ # Get the selected sample ids from the sample metadata DataFrame.
+ relevant_sample_ids = selected_samples_df["sample_id"].values
+
+ # Get all the sample ids from the unfiltered Dataset.
+ ds_sample_ids = ds.coords["sample_id"].values
+
+ # Get the indices of samples in the Dataset that match the relevant sample ids.
+ # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
+ relevant_sample_indices = np.where(
+ np.isin(ds_sample_ids, relevant_sample_ids)
+ )[0]
+
+ # Preserve the behaviour of raising a `ValueError` instead of empty results.
+ if relevant_sample_indices.size == 0:
+ raise ValueError("No relevant samples found.")
+
+ # Select only the relevant samples from the Dataset.
+ ds = ds.isel(samples=relevant_sample_indices)
# Handle cohort size, overrides min and max.
if cohort_size is not None:
@@ -1939,7 +1972,7 @@ def _biallelic_diplotypes(
inline_array,
chunks,
):
- # Note: this uses sample_indices and should not expect a sample_query.
+ # Note: this function uses sample_indices and should not expect a sample_query.
# Access biallelic SNPs.
ds = self.biallelic_snp_calls(
From 4b9904f073ebe16b056b2f842feb5f198920e102 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Tue, 17 Jun 2025 10:21:52 +0100
Subject: [PATCH 32/32] Amend snp_genotypes to handle sample_indices when
surveillance_use_only
---
malariagen_data/anoph/snp_data.py | 71 ++++++++++++++++++++++---------
1 file changed, 51 insertions(+), 20 deletions(-)
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
index 2c1f0fc46..98b159012 100644
--- a/malariagen_data/anoph/snp_data.py
+++ b/malariagen_data/anoph/snp_data.py
@@ -461,24 +461,26 @@ def snp_genotypes(
sample_query=sample_query, sample_indices=sample_indices
)
- # Normalise parameters.
- sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
+ # Prepare parameters.
+ prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
+ prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
+ prepared_regions: List[Region] = parse_multi_region(self, region)
+ prepared_site_mask = self._prep_optional_site_mask_param(site_mask=site_mask)
+
+ # Delete original parameters to prevent accidental use.
del sample_sets
- sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
del sample_query
- regions: List[Region] = parse_multi_region(self, region)
del region
- site_mask_prepped = self._prep_optional_site_mask_param(site_mask=site_mask)
del site_mask
with self._spinner("Access SNP genotypes"):
# Concatenate multiple sample sets and/or contigs.
lx = []
- for r in regions:
+ for r in prepared_regions:
contig = r.contig
ly = []
- for s in sample_sets_prepped:
+ for s in prepared_sample_sets:
y = self._snp_genotypes_for_contig(
contig=contig,
sample_set=s,
@@ -508,24 +510,53 @@ def snp_genotypes(
d = da_concat(lx, axis=0)
# Apply site filters if requested.
- if site_mask_prepped is not None:
+ if prepared_site_mask is not None:
loc_sites = self.site_filters(
- region=regions,
- mask=site_mask_prepped,
+ region=prepared_regions,
+ mask=prepared_site_mask,
)
d = da_compress(loc_sites, d, axis=0)
- # Apply sample selection if requested.
- if sample_query_prepped is not None:
- df_samples = self.sample_metadata(sample_sets=sample_sets_prepped)
- sample_query_options = sample_query_options or {}
- loc_samples = df_samples.eval(
- sample_query_prepped, **sample_query_options
- ).values
- if np.count_nonzero(loc_samples) == 0:
- raise ValueError(f"No samples found for query {sample_query_prepped!r}")
+ # Apply the sample_query, if there is one.
+ # Note: this might have been internally modified, e.g. `is_surveillance == True`.
+ if prepared_sample_query is not None:
+ # Note: the unfiltered Dask array `d` is not aligned with the filtered `sample_metadata`,
+ # so we cannot use filtered `sample_metadata` to get the relevant boolean filter.
+
+ # Note: the unfiltered Dask array `d` does not contain sample identifiers,
+ # so we cannot use a list of relevant sample ids to produce the boolean filter directly.
+
+ # Note: we can first determine the list of relevant sample ids using filtered `sample_metadata`,
+ # then use the unfiltered `general_metadata` to determine the appropriate boolean filter.
+
+ df_filtered_samples = self.sample_metadata(
+ sample_sets=prepared_sample_sets,
+ sample_query=prepared_sample_query,
+ sample_query_options=sample_query_options,
+ )
+
+ # Raise an error if no samples match the sample query.
+ if len(df_filtered_samples) == 0:
+ raise ValueError(
+ f"No samples found for query {prepared_sample_query!r}"
+ )
+
+ # Get the list of unfiltered samples, in order to produce an aligned boolean filter.
+ df_unfiltered_samples = self.general_metadata(
+ sample_sets=prepared_sample_sets
+ )
+
+ # Get a boolean array for unfiltered data, indicating which samples match the query.
+ loc_samples = df_unfiltered_samples["sample_id"].isin(
+ df_filtered_samples["sample_id"]
+ )
+
+ # Filter the Dask array using the boolean array.
d = da.compress(loc_samples, d, axis=1)
- elif sample_indices is not None:
+
+ # Apply the sample_indices, if there are any.
+ # Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`.
+ if sample_indices is not None:
d = da.take(d, sample_indices, axis=1)
return d