Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions admixture_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# import itertools
import random
import pytest
from pytest_cases import parametrize_with_cases
import numpy as np
# import bokeh.models
# import pandas as pd
# import plotly.graph_objects as go

from malariagen_data import af1 as _af1
from malariagen_data import ag3 as _ag3
from malariagen_data.anoph.admixture_stats import AnophelesAdmixtureAnalysis


@pytest.fixture
def ag3_sim_api(ag3_sim_fixture):
    """Return an admixture-analysis API object backed by the simulated Ag3 data."""
    aim_metadata_dtype = {
        "aim_species_fraction_arab": "float64",
        "aim_species_fraction_colu": "float64",
        "aim_species_fraction_colu_no2l": "float64",
        "aim_species_gambcolu_arabiensis": object,
        "aim_species_gambiae_coluzzii": object,
        "aim_species": object,
    }
    api_params = dict(
        url=ag3_sim_fixture.url,
        config_path=_ag3.CONFIG_PATH,
        major_version_number=_ag3.MAJOR_VERSION_NUMBER,
        major_version_path=_ag3.MAJOR_VERSION_PATH,
        pre=True,
        aim_metadata_dtype=aim_metadata_dtype,
        gff_gene_type="gene",
        gff_gene_name_attribute="Name",
        gff_default_attributes=("ID", "Parent", "Name", "description"),
        default_site_mask="gamb_colu_arab",
        results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
        taxon_colors=_ag3.TAXON_COLORS,
        virtual_contigs=_ag3.VIRTUAL_CONTIGS,
    )
    return AnophelesAdmixtureAnalysis(**api_params)


@pytest.fixture
def af1_sim_api(af1_sim_fixture):
    """Return an admixture-analysis API object backed by the simulated Af1 data."""
    api_params = dict(
        url=af1_sim_fixture.url,
        config_path=_af1.CONFIG_PATH,
        major_version_number=_af1.MAJOR_VERSION_NUMBER,
        major_version_path=_af1.MAJOR_VERSION_PATH,
        pre=False,
        gff_gene_type="protein_coding_gene",
        gff_gene_name_attribute="Note",
        gff_default_attributes=("ID", "Parent", "Note", "description"),
        default_site_mask="funestus",
        results_cache=af1_sim_fixture.results_cache_path.as_posix(),
        taxon_colors=_af1.TAXON_COLORS,
    )
    return AnophelesAdmixtureAnalysis(**api_params)


# N.B., here we use pytest_cases to parametrize tests. Each
# function whose name begins with "case_" defines a set of
# inputs to the test functions. See the documentation for
# pytest_cases for more information, e.g.:
#
# https://smarie.github.io/python-pytest-cases/#basic-usage
#
# We use this approach here because we want to use fixtures
# as test parameters, which is otherwise hard to do with
# pytest alone.


def case_ag3_sim(ag3_sim_fixture, ag3_sim_api):
    """Pytest-cases case: pair the Ag3 simulated fixture with its API object."""
    return (ag3_sim_fixture, ag3_sim_api)


def case_af1_sim(af1_sim_fixture, af1_sim_api):
    """Pytest-cases case: pair the Af1 simulated fixture with its API object."""
    return (af1_sim_fixture, af1_sim_api)


@parametrize_with_cases("fixture,api", cases=".")
def test_patterson_f3(fixture, api: AnophelesAdmixtureAnalysis):
    """Exercise patterson_f3 with randomly chosen cohorts and parameters,
    checking the types and plausible ranges of the returned statistics."""
    # Set up test parameters. Sample three *distinct* taxa so that the
    # recipient and the two source cohorts are guaranteed to differ.
    # BUGFIX: the original sampled the per-sample taxon column, which
    # contains duplicate values, so two cohorts could end up identical.
    all_taxon = api.sample_metadata()["taxon"].to_list()
    distinct_taxa = sorted(set(all_taxon))
    assert len(distinct_taxa) >= 3, "test requires at least three distinct taxa"
    taxon = random.sample(distinct_taxa, 3)
    recipient_query = f"taxon == {taxon[0]!r}"
    source1_query = f"taxon == {taxon[1]!r}"
    source2_query = f"taxon == {taxon[2]!r}"
    admixture_params = dict(
        region=random.choice(api.contigs),
        recipient_query=recipient_query,
        source1_query=source1_query,
        source2_query=source2_query,
        site_mask=random.choice(api.site_mask_ids),
        segregating_mode=random.choice(["recipient", "all"]),
        n_jack=random.randint(10, 200),
        min_cohort_size=1,
        max_cohort_size=50,
    )

    # Run patterson f3 function under test.
    f3, se, z = api.patterson_f3(**admixture_params)

    # Check results.
    assert isinstance(f3, np.float64)
    assert isinstance(se, np.float64)
    assert isinstance(z, np.float64)
    assert -0.1 <= f3 <= 1
    assert 0 <= se <= 1
    # N.B., no bound is asserted on z: cohorts are sampled from across all
    # data and the z-score can be very large.
273 changes: 273 additions & 0 deletions malariagen_data/anoph/admixture_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
from typing import Tuple, Optional

import allel
import numpy as np
import numba

from . import base_params
from . import admixture_stats_params
from .snp_data import AnophelesSampleMetadata, AnophelesSnpData


@numba.njit
def _remap_observed(ac_full):
    """Build a per-variant allele index mapping that drops unobserved alleles.

    For each variant, alleles with a nonzero count are renumbered compactly
    starting from 0 (in their original order); alleles with a zero count are
    mapped to -1. The result is suitable for use with scikit-allel's
    ``map_alleles``.
    """
    n_variants, n_alleles = ac_full.shape
    # -1 marks alleles that are not observed and therefore have no new index.
    mapping = np.full((n_variants, n_alleles), -1, dtype=np.int32)
    for row in range(n_variants):
        # Next compact index to assign within this variant.
        next_index = 0
        for col in range(n_alleles):
            if ac_full[row, col] > 0:
                # Observed allele: assign the next compact index.
                mapping[row, col] = next_index
                next_index += 1
    return mapping


class AnophelesAdmixtureAnalysis(
    AnophelesSnpData,
    AnophelesSampleMetadata,
):
    """Mixin providing Patterson's admixture statistics (f3 and f4/D)
    computed from SNP allele counts."""

    def __init__(
        self,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

    def patterson_f3(
        self,
        recipient_query: base_params.sample_query,
        source1_query: base_params.sample_query,
        source2_query: base_params.sample_query,
        region: base_params.region,
        site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
        n_jack: base_params.n_jack = 200,
        max_missing_an: base_params.max_missing_an = 0,
        segregating_mode: admixture_stats_params.segregating_mode = "recipient",
        cohort_size: Optional[
            base_params.cohort_size
        ] = admixture_stats_params.cohort_size_default,
        min_cohort_size: Optional[
            base_params.min_cohort_size
        ] = admixture_stats_params.min_cohort_size_default,
        max_cohort_size: Optional[
            base_params.max_cohort_size
        ] = admixture_stats_params.max_cohort_size_default,
    ) -> Tuple[float, float, float]:
        """Compute Patterson's f3 statistic for a recipient cohort and two
        source cohorts.

        Parameters follow the conventions used elsewhere in this package:
        cohorts are selected via pandas-style sample queries, allele counts
        are taken over ``region`` with the given ``site_mask``, and the
        standard error is estimated by a block jackknife with ``n_jack``
        blocks.

        Returns
        -------
        A tuple ``(f3, se, z)`` of the f3 estimate, its jackknife standard
        error, and the corresponding z-score.

        Raises
        ------
        ValueError
            If ``segregating_mode`` is invalid or no segregating variants
            remain after filtering.
        """
        # Purely for conciseness, internally here we use the scikit-allel
        # convention of labelling the recipient population "C" and the source
        # populations as "A" and "B".

        # Compute allele counts for the three cohorts.
        acc = self.snp_allele_counts(
            sample_query=recipient_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )
        aca = self.snp_allele_counts(
            sample_query=source1_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )
        acb = self.snp_allele_counts(
            sample_query=source2_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )

        # Locate biallelic variants and deal with missingness.
        ac = acc + aca + acb
        loc_bi = allel.AlleleCountsArray(ac).is_biallelic()
        anc = np.sum(acc, axis=1)
        ana = np.sum(aca, axis=1)
        anb = np.sum(acb, axis=1)
        # Total called chromosomes per site; the maximum over sites is taken
        # as the total number of chromosomes sampled, from which per-site
        # missingness is derived.
        an = np.sum(ac, axis=1)
        n_chroms = an.max()
        an_missing = n_chroms - an
        # In addition to applying the max_missing_an threshold, also make sure
        # that all three cohorts have nonzero allele counts.
        loc_nomiss = (an_missing <= max_missing_an) & (anc > 0) & (ana > 0) & (anb > 0)
        loc_sites = loc_bi & loc_nomiss
        acc_bi = acc[loc_sites]
        aca_bi = aca[loc_sites]
        acb_bi = acb[loc_sites]
        ac_bi = ac[loc_sites]

        # Squeeze the allele counts so we end up with only two columns.
        sqz_mapping = _remap_observed(ac_bi)
        acc_sqz = allel.AlleleCountsArray(acc_bi).map_alleles(sqz_mapping, max_allele=1)
        aca_sqz = allel.AlleleCountsArray(aca_bi).map_alleles(sqz_mapping, max_allele=1)
        acb_sqz = allel.AlleleCountsArray(acb_bi).map_alleles(sqz_mapping, max_allele=1)

        # Keep only segregating variants.
        if segregating_mode == "recipient":
            loc_seg = acc_sqz.is_segregating()
        elif segregating_mode == "all":
            loc_seg = (
                acc_sqz.is_segregating()
                & aca_sqz.is_segregating()
                & acb_sqz.is_segregating()
            )
        else:
            raise ValueError("Invalid value for 'segregating_mode' parameter.")
        n_seg = loc_seg.sum()
        if n_seg == 0:
            raise ValueError("No segregating variants found")
        print(f"Using {n_seg} segregating variants for F3 calculation.")
        acc_seg = acc_sqz[loc_seg]
        aca_seg = aca_sqz[loc_seg]
        acb_seg = acb_sqz[loc_seg]

        # Compute f3 statistic with a block jackknife standard error.
        # BUGFIX: guard against blen == 0, which occurs when there are fewer
        # segregating variants than n_jack and would break scikit-allel.
        blen = max(1, acc_seg.shape[0] // n_jack)
        f3, se, z, _, _ = allel.average_patterson_f3(
            acc_seg,
            aca_seg,
            acb_seg,
            blen=blen,
            normed=True,
        )

        return f3, se, z

    def patterson_f4(
        self,
        a_query: base_params.sample_query,
        b_query: base_params.sample_query,
        c_query: base_params.sample_query,
        d_query: base_params.sample_query,
        region: base_params.region,
        site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
        n_jack: base_params.n_jack = 200,
        max_missing_an: base_params.max_missing_an = 0,
        segregating_mode: admixture_stats_params.segregating_mode = "recipient",
        cohort_size: Optional[
            base_params.cohort_size
        ] = admixture_stats_params.cohort_size_default,
        min_cohort_size: Optional[
            base_params.min_cohort_size
        ] = admixture_stats_params.min_cohort_size_default,
        max_cohort_size: Optional[
            base_params.max_cohort_size
        ] = admixture_stats_params.max_cohort_size_default,
    ) -> Tuple[float, float, float]:
        """Compute Patterson's f4 (D) statistic for four cohorts A, B, C, D.

        Cohorts are selected via pandas-style sample queries and the standard
        error is estimated by a block jackknife with ``n_jack`` blocks. Note
        that there is no recipient population for f4; with
        ``segregating_mode="recipient"`` variants are conditioned on
        segregating in cohort A (preserving previous behaviour).

        Returns
        -------
        A tuple ``(f4, se, z)`` of the f4/D estimate, its jackknife standard
        error, and the corresponding z-score.

        Raises
        ------
        ValueError
            If ``segregating_mode`` is invalid or no segregating variants
            remain after filtering.
        """
        # Compute allele counts for the four cohorts.
        aca = self.snp_allele_counts(
            sample_query=a_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )
        acb = self.snp_allele_counts(
            sample_query=b_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )
        acc = self.snp_allele_counts(
            sample_query=c_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )
        acd = self.snp_allele_counts(
            sample_query=d_query,
            region=region,
            site_mask=site_mask,
            cohort_size=cohort_size,
            min_cohort_size=min_cohort_size,
            max_cohort_size=max_cohort_size,
        )

        # Locate biallelic variants and deal with missingness.
        # BUGFIX: the total allele count previously omitted cohort D, so
        # biallelic and missingness filtering ignored the fourth cohort.
        ac = aca + acb + acc + acd
        loc_bi = allel.AlleleCountsArray(ac).is_biallelic()
        ana = np.sum(aca, axis=1)
        anb = np.sum(acb, axis=1)
        anc = np.sum(acc, axis=1)
        and_ = np.sum(acd, axis=1)
        # Total called chromosomes per site; the maximum over sites is taken
        # as the total number of chromosomes sampled, from which per-site
        # missingness is derived.
        an = np.sum(ac, axis=1)
        n_chroms = an.max()
        an_missing = n_chroms - an
        # In addition to applying the max_missing_an threshold, also make sure
        # that all four cohorts have nonzero allele counts.
        # BUGFIX: cohort D was previously not checked for nonzero counts.
        loc_nomiss = (
            (an_missing <= max_missing_an)
            & (ana > 0)
            & (anb > 0)
            & (anc > 0)
            & (and_ > 0)
        )
        loc_sites = loc_bi & loc_nomiss
        # BUGFIX: the cohort arrays were previously assigned to mislabelled
        # variables (aca_bi = acc[...], etc.), which made the segregation
        # filtering below operate on the wrong cohorts.
        aca_bi = aca[loc_sites]
        acb_bi = acb[loc_sites]
        acc_bi = acc[loc_sites]
        acd_bi = acd[loc_sites]
        ac_bi = ac[loc_sites]

        # Squeeze the allele counts so we end up with only two columns.
        sqz_mapping = _remap_observed(ac_bi)
        aca_sqz = allel.AlleleCountsArray(aca_bi).map_alleles(sqz_mapping, max_allele=1)
        acb_sqz = allel.AlleleCountsArray(acb_bi).map_alleles(sqz_mapping, max_allele=1)
        acc_sqz = allel.AlleleCountsArray(acc_bi).map_alleles(sqz_mapping, max_allele=1)
        acd_sqz = allel.AlleleCountsArray(acd_bi).map_alleles(sqz_mapping, max_allele=1)

        # Keep only segregating variants.
        if segregating_mode == "recipient":
            # No recipient exists for f4; condition on segregation in cohort A,
            # matching the effective behaviour of the previous implementation.
            loc_seg = aca_sqz.is_segregating()
        elif segregating_mode == "all":
            # BUGFIX: cohort D was previously omitted from the "all" check.
            loc_seg = (
                aca_sqz.is_segregating()
                & acb_sqz.is_segregating()
                & acc_sqz.is_segregating()
                & acd_sqz.is_segregating()
            )
        else:
            raise ValueError("Invalid value for 'segregating_mode' parameter.")
        n_seg = loc_seg.sum()
        if n_seg == 0:
            raise ValueError("No segregating variants found")
        # BUGFIX: the message previously said "F3".
        print(f"Using {n_seg} segregating variants for F4 calculation.")
        aca_seg = aca_sqz[loc_seg]
        acb_seg = acb_sqz[loc_seg]
        acc_seg = acc_sqz[loc_seg]
        acd_seg = acd_sqz[loc_seg]

        # Compute f4 (D) statistic with a block jackknife standard error.
        # BUGFIX: guard against blen == 0, which occurs when there are fewer
        # segregating variants than n_jack and would break scikit-allel.
        blen = max(1, aca_seg.shape[0] // n_jack)
        f4, se, z, _, _ = allel.average_patterson_d(
            aca_seg,
            acb_seg,
            acc_seg,
            acd_seg,
            blen=blen,
        )

        return f4, se, z
Loading
Loading