Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions admixture_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# import itertools
import random
import pytest
from pytest_cases import parametrize_with_cases
import numpy as np
# import bokeh.models
# import pandas as pd
# import plotly.graph_objects as go

from malariagen_data import af1 as _af1
from malariagen_data import ag3 as _ag3
from malariagen_data.anoph.admixture_stats import AnophelesAdmixtureAnalysis


@pytest.fixture
def ag3_sim_api(ag3_sim_fixture):
    """Return an admixture-analysis API object backed by the simulated Ag3 data."""
    aim_metadata_dtype = {
        "aim_species_fraction_arab": "float64",
        "aim_species_fraction_colu": "float64",
        "aim_species_fraction_colu_no2l": "float64",
        "aim_species_gambcolu_arabiensis": object,
        "aim_species_gambiae_coluzzii": object,
        "aim_species": object,
    }
    api_params = dict(
        url=ag3_sim_fixture.url,
        config_path=_ag3.CONFIG_PATH,
        major_version_number=_ag3.MAJOR_VERSION_NUMBER,
        major_version_path=_ag3.MAJOR_VERSION_PATH,
        pre=True,
        aim_metadata_dtype=aim_metadata_dtype,
        gff_gene_type="gene",
        gff_gene_name_attribute="Name",
        gff_default_attributes=("ID", "Parent", "Name", "description"),
        default_site_mask="gamb_colu_arab",
        results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
        taxon_colors=_ag3.TAXON_COLORS,
        virtual_contigs=_ag3.VIRTUAL_CONTIGS,
    )
    return AnophelesAdmixtureAnalysis(**api_params)


@pytest.fixture
def af1_sim_api(af1_sim_fixture):
    """Return an admixture-analysis API object backed by the simulated Af1 data."""
    api_params = dict(
        url=af1_sim_fixture.url,
        config_path=_af1.CONFIG_PATH,
        major_version_number=_af1.MAJOR_VERSION_NUMBER,
        major_version_path=_af1.MAJOR_VERSION_PATH,
        pre=False,
        gff_gene_type="protein_coding_gene",
        gff_gene_name_attribute="Note",
        gff_default_attributes=("ID", "Parent", "Note", "description"),
        default_site_mask="funestus",
        results_cache=af1_sim_fixture.results_cache_path.as_posix(),
        taxon_colors=_af1.TAXON_COLORS,
    )
    return AnophelesAdmixtureAnalysis(**api_params)


# N.B., here we use pytest_cases to parametrize tests. Each
# function whose name begins with "case_" defines a set of
# inputs to the test functions. See the documentation for
# pytest_cases for more information, e.g.:
#
# https://smarie.github.io/python-pytest-cases/#basic-usage
#
# We use this approach here because we want to use fixtures
# as test parameters, which is otherwise hard to do with
# pytest alone.


def case_ag3_sim(ag3_sim_fixture, ag3_sim_api):
    """Pytest-cases case: pair the Ag3 simulated fixture with its API object."""
    return (ag3_sim_fixture, ag3_sim_api)


def case_af1_sim(af1_sim_fixture, af1_sim_api):
    """Pytest-cases case: pair the Af1 simulated fixture with its API object."""
    return (af1_sim_fixture, af1_sim_api)


@parametrize_with_cases("fixture,api", cases=".")
def test_patterson_f3(fixture, api: AnophelesAdmixtureAnalysis):
    """Exercise patterson_f3 with randomly chosen cohorts and parameters,
    checking the types and plausible ranges of the returned statistics."""
    # Set up test parameters. Sample three *distinct* taxa so that the
    # recipient and the two source cohorts are guaranteed to differ.
    # BUGFIX: the original sampled the per-sample taxon column, which
    # contains duplicate values, so two cohorts could end up identical.
    all_taxon = api.sample_metadata()["taxon"].to_list()
    distinct_taxa = sorted(set(all_taxon))
    assert len(distinct_taxa) >= 3, "test requires at least three distinct taxa"
    taxon = random.sample(distinct_taxa, 3)
    recipient_query = f"taxon == {taxon[0]!r}"
    source1_query = f"taxon == {taxon[1]!r}"
    source2_query = f"taxon == {taxon[2]!r}"
    admixture_params = dict(
        region=random.choice(api.contigs),
        recipient_query=recipient_query,
        source1_query=source1_query,
        source2_query=source2_query,
        site_mask=random.choice(api.site_mask_ids),
        segregating_mode=random.choice(["recipient", "all"]),
        n_jack=random.randint(10, 200),
        min_cohort_size=1,
        max_cohort_size=50,
    )

    # Run patterson f3 function under test.
    f3, se, z = api.patterson_f3(**admixture_params)

    # Check results.
    assert isinstance(f3, np.float64)
    assert isinstance(se, np.float64)
    assert isinstance(z, np.float64)
    assert -0.1 <= f3 <= 1
    assert 0 <= se <= 1
    # N.B., no bound is asserted on z: cohorts are sampled from across all
    # data and the z-score can be very large.
273 changes: 273 additions & 0 deletions malariagen_data/anoph/admixture_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
from typing import Tuple, Optional

import allel
import numpy as np
import numba

from . import base_params
from . import admixture_stats_params
from .snp_data import AnophelesSampleMetadata, AnophelesSnpData


@numba.njit
def _remap_observed(ac_full):
    """Build a per-variant allele index mapping that drops unobserved alleles.

    For each variant, alleles with a nonzero count are renumbered compactly
    starting from 0 (in their original order); alleles with a zero count are
    mapped to -1. The result is suitable for use with scikit-allel's
    ``map_alleles``.
    """
    n_variants, n_alleles = ac_full.shape
    # -1 marks alleles that are not observed and therefore have no new index.
    mapping = np.full((n_variants, n_alleles), -1, dtype=np.int32)
    for row in range(n_variants):
        # Next compact index to assign within this variant.
        next_index = 0
        for col in range(n_alleles):
            if ac_full[row, col] > 0:
                # Observed allele: assign the next compact index.
                mapping[row, col] = next_index
                next_index += 1
    return mapping


class AnophelesAdmixtureAnalysis(
    AnophelesSnpData,
    AnophelesSampleMetadata,
):
    """Mixin providing Patterson's admixture statistics (f3 and f4/D)
    computed from SNP allele counts."""

    def __init__(
        self,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

    def patterson_f3(
        self,
        recipient_query: base_params.sample_query,
        source1_query: base_params.sample_query,
        source2_query: base_params.sample_query,
        region: base_params.region,
        site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
        n_jack: base_params.n_jack = 200,
        max_missing_an: base_params.max_missing_an = 0,
        segregating_mode: admixture_stats_params.segregating_mode = "recipient",
        cohort_size: Optional[
            base_params.cohort_size
        ] = admixture_stats_params.cohort_size_default,
        min_cohort_size: Optional[
            base_params.min_cohort_size
        ] = admixture_stats_params.min_cohort_size_default,
        max_cohort_size: Optional[
            base_params.max_cohort_size
        ] = admixture_stats_params.max_cohort_size_default,
    ) -> Tuple[float, float, float]:
        """Compute Patterson's f3 statistic for a recipient cohort and two
        source cohorts.

        Parameters follow the conventions used elsewhere in this package:
        cohorts are selected via pandas-style sample queries, allele counts
        are taken over ``region`` with the given ``site_mask``, and the
        standard error is estimated by a block jackknife with ``n_jack``
        blocks.

        Returns
        -------
        A tuple ``(f3, se, z)`` of the f3 estimate, its jackknife standard
        error, and the corresponding z-score.

        Raises
        ------
        ValueError
            If ``segregating_mode`` is invalid or no segregating variants
            remain after filtering.
        """
        # Purely for conciseness, internally here we use the scikit-allel
        # convention of labelling the recipient population "C" and the source
        # populations as "A" and "B".

        # Compute allele counts for the three cohorts.
        acc = self.snp_allele_counts(
            sample_query=recipient_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )
        aca = self.snp_allele_counts(
            sample_query=source1_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )
        acb = self.snp_allele_counts(
            sample_query=source2_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )

        # Locate biallelic variants and deal with missingness.
        ac = acc + aca + acb
        loc_bi = allel.AlleleCountsArray(ac).is_biallelic()
        anc = np.sum(acc, axis=1)
        ana = np.sum(aca, axis=1)
        anb = np.sum(acb, axis=1)
        # Total called chromosomes per site; the maximum over sites is taken
        # as the total number of chromosomes sampled, from which per-site
        # missingness is derived.
        an = np.sum(ac, axis=1)
        n_chroms = an.max()
        an_missing = n_chroms - an
        # In addition to applying the max_missing_an threshold, also make sure
        # that all three cohorts have nonzero allele counts.
        loc_nomiss = (an_missing <= max_missing_an) & (anc > 0) & (ana > 0) & (anb > 0)
        loc_sites = loc_bi & loc_nomiss
        acc_bi = acc[loc_sites]
        aca_bi = aca[loc_sites]
        acb_bi = acb[loc_sites]
        ac_bi = ac[loc_sites]

        # Squeeze the allele counts so we end up with only two columns.
        sqz_mapping = _remap_observed(ac_bi)
        acc_sqz = allel.AlleleCountsArray(acc_bi).map_alleles(sqz_mapping, max_allele=1)
        aca_sqz = allel.AlleleCountsArray(aca_bi).map_alleles(sqz_mapping, max_allele=1)
        acb_sqz = allel.AlleleCountsArray(acb_bi).map_alleles(sqz_mapping, max_allele=1)

        # Keep only segregating variants.
        if segregating_mode == "recipient":
            loc_seg = acc_sqz.is_segregating()
        elif segregating_mode == "all":
            loc_seg = (
                acc_sqz.is_segregating()
                & aca_sqz.is_segregating()
                & acb_sqz.is_segregating()
            )
        else:
            raise ValueError("Invalid value for 'segregating_mode' parameter.")
        n_seg = loc_seg.sum()
        if n_seg == 0:
            raise ValueError("No segregating variants found")
        print(f"Using {n_seg} segregating variants for F3 calculation.")
        acc_seg = acc_sqz[loc_seg]
        aca_seg = aca_sqz[loc_seg]
        acb_seg = acb_sqz[loc_seg]

        # Compute f3 statistic with a block jackknife standard error.
        # BUGFIX: guard against blen == 0, which occurs when there are fewer
        # segregating variants than n_jack and would break scikit-allel.
        blen = max(1, acc_seg.shape[0] // n_jack)
        f3, se, z, _, _ = allel.average_patterson_f3(
            acc_seg,
            aca_seg,
            acb_seg,
            blen=blen,
            normed=True,
        )

        return f3, se, z

    def patterson_f4(
        self,
        a_query: base_params.sample_query,
        b_query: base_params.sample_query,
        c_query: base_params.sample_query,
        d_query: base_params.sample_query,
        region: base_params.region,
        site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
        n_jack: base_params.n_jack = 200,
        max_missing_an: base_params.max_missing_an = 0,
        segregating_mode: admixture_stats_params.segregating_mode = "recipient",
        cohort_size: Optional[
            base_params.cohort_size
        ] = admixture_stats_params.cohort_size_default,
        min_cohort_size: Optional[
            base_params.min_cohort_size
        ] = admixture_stats_params.min_cohort_size_default,
        max_cohort_size: Optional[
            base_params.max_cohort_size
        ] = admixture_stats_params.max_cohort_size_default,
    ) -> Tuple[float, float, float]:
        """Compute Patterson's f4 (D) statistic for four cohorts A, B, C, D.

        Cohorts are selected via pandas-style sample queries and the standard
        error is estimated by a block jackknife with ``n_jack`` blocks. Note
        that there is no recipient population for f4; with
        ``segregating_mode="recipient"`` variants are conditioned on
        segregating in cohort A (preserving previous behaviour).

        Returns
        -------
        A tuple ``(f4, se, z)`` of the f4/D estimate, its jackknife standard
        error, and the corresponding z-score.

        Raises
        ------
        ValueError
            If ``segregating_mode`` is invalid or no segregating variants
            remain after filtering.
        """
        # Compute allele counts for the four cohorts.
        aca = self.snp_allele_counts(
            sample_query=a_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )
        acb = self.snp_allele_counts(
            sample_query=b_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )
        acc = self.snp_allele_counts(
            sample_query=c_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )
        acd = self.snp_allele_counts(
            sample_query=d_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )

        # Locate biallelic variants and deal with missingness.
        # BUGFIX: the total allele count previously omitted cohort D, so
        # biallelic and missingness filtering ignored the fourth cohort.
        ac = aca + acb + acc + acd
        loc_bi = allel.AlleleCountsArray(ac).is_biallelic()
        ana = np.sum(aca, axis=1)
        anb = np.sum(acb, axis=1)
        anc = np.sum(acc, axis=1)
        and_ = np.sum(acd, axis=1)
        # Total called chromosomes per site; the maximum over sites is taken
        # as the total number of chromosomes sampled, from which per-site
        # missingness is derived.
        an = np.sum(ac, axis=1)
        n_chroms = an.max()
        an_missing = n_chroms - an
        # In addition to applying the max_missing_an threshold, also make sure
        # that all four cohorts have nonzero allele counts.
        # BUGFIX: cohort D was previously not checked for nonzero counts.
        loc_nomiss = (
            (an_missing <= max_missing_an)
            & (ana > 0)
            & (anb > 0)
            & (anc > 0)
            & (and_ > 0)
        )
        loc_sites = loc_bi & loc_nomiss
        # BUGFIX: the cohort arrays were previously assigned to mislabelled
        # variables (aca_bi = acc[...], etc.), which made the segregation
        # filtering below operate on the wrong cohorts.
        aca_bi = aca[loc_sites]
        acb_bi = acb[loc_sites]
        acc_bi = acc[loc_sites]
        acd_bi = acd[loc_sites]
        ac_bi = ac[loc_sites]

        # Squeeze the allele counts so we end up with only two columns.
        sqz_mapping = _remap_observed(ac_bi)
        aca_sqz = allel.AlleleCountsArray(aca_bi).map_alleles(sqz_mapping, max_allele=1)
        acb_sqz = allel.AlleleCountsArray(acb_bi).map_alleles(sqz_mapping, max_allele=1)
        acc_sqz = allel.AlleleCountsArray(acc_bi).map_alleles(sqz_mapping, max_allele=1)
        acd_sqz = allel.AlleleCountsArray(acd_bi).map_alleles(sqz_mapping, max_allele=1)

        # Keep only segregating variants.
        if segregating_mode == "recipient":
            # No recipient exists for f4; condition on segregation in cohort A,
            # matching the effective behaviour of the previous implementation.
            loc_seg = aca_sqz.is_segregating()
        elif segregating_mode == "all":
            # BUGFIX: cohort D was previously omitted from the "all" check.
            loc_seg = (
                aca_sqz.is_segregating()
                & acb_sqz.is_segregating()
                & acc_sqz.is_segregating()
                & acd_sqz.is_segregating()
            )
        else:
            raise ValueError("Invalid value for 'segregating_mode' parameter.")
        n_seg = loc_seg.sum()
        if n_seg == 0:
            raise ValueError("No segregating variants found")
        # BUGFIX: the message previously said "F3".
        print(f"Using {n_seg} segregating variants for F4 calculation.")
        aca_seg = aca_sqz[loc_seg]
        acb_seg = acb_sqz[loc_seg]
        acc_seg = acc_sqz[loc_seg]
        acd_seg = acd_sqz[loc_seg]

        # Compute f4 (D) statistic with a block jackknife standard error.
        # BUGFIX: guard against blen == 0, which occurs when there are fewer
        # segregating variants than n_jack and would break scikit-allel.
        blen = max(1, aca_seg.shape[0] // n_jack)
        f4, se, z, _, _ = allel.average_patterson_d(
            aca_seg,
            acb_seg,
            acc_seg,
            acd_seg,
            blen=blen,
        )

        return f4, se, z
Loading
Loading