Skip to content

Commit 73b4439

Browse files
committed
Factoring some shared code
1 parent deb5f9a commit 73b4439

File tree

3 files changed

+126
-220
lines changed

3 files changed

+126
-220
lines changed

malariagen_data/anoph/hap_frq.py

Lines changed: 7 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,13 @@
66
import allel
77
from numpydoc_decorator import doc # type: ignore
88

9-
from ..util import check_types, haplotype_frequencies
9+
from ..util import (
10+
check_types,
11+
haplotype_frequencies,
12+
_prep_samples_for_cohort_grouping,
13+
_build_cohorts_from_sample_grouping,
14+
_add_frequency_ci,
15+
)
1016
from .hap_data import AnophelesHapData
1117
from .sample_metadata import locate_cohorts
1218
from . import base_params, frq_params
@@ -269,115 +275,3 @@ def haplotypes_frequencies_advanced(
269275
_add_frequency_ci(ds=ds_out, ci_method=ci_method)
270276

271277
return ds_out
272-
273-
274-
def _prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
275-
# Take a copy, as we will modify the dataframe.
276-
df_samples = df_samples.copy()
277-
278-
# Fix "intermediate" or "unassigned" taxon values - we only want to build
279-
# cohorts with clean taxon calls, so we set other values to None.
280-
loc_intermediate_taxon = (
281-
df_samples["taxon"].str.startswith("intermediate").fillna(False)
282-
)
283-
df_samples.loc[loc_intermediate_taxon, "taxon"] = None
284-
loc_unassigned_taxon = (
285-
df_samples["taxon"].str.startswith("unassigned").fillna(False)
286-
)
287-
df_samples.loc[loc_unassigned_taxon, "taxon"] = None
288-
289-
# Add period column.
290-
if period_by == "year":
291-
make_period = _make_sample_period_year
292-
elif period_by == "quarter":
293-
make_period = _make_sample_period_quarter
294-
elif period_by == "month":
295-
make_period = _make_sample_period_month
296-
else: # pragma: no cover
297-
raise ValueError(
298-
f"Value for period_by parameter must be one of 'year', 'quarter', 'month'; found {period_by!r}."
299-
)
300-
sample_period = df_samples.apply(make_period, axis="columns")
301-
df_samples["period"] = sample_period
302-
303-
# Add area column for consistent output.
304-
df_samples["area"] = df_samples[area_by]
305-
306-
return df_samples
307-
308-
309-
def _build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_size):
310-
# Build cohorts dataframe.
311-
df_cohorts = group_samples_by_cohort.agg(
312-
size=("sample_id", len),
313-
lat_mean=("latitude", "mean"),
314-
lat_max=("latitude", "max"),
315-
lat_min=("latitude", "min"),
316-
lon_mean=("longitude", "mean"),
317-
lon_max=("longitude", "max"),
318-
lon_min=("longitude", "min"),
319-
)
320-
# Reset index so that the index fields are included as columns.
321-
df_cohorts = df_cohorts.reset_index()
322-
323-
# Add cohort helper variables.
324-
cohort_period_start = df_cohorts["period"].apply(lambda v: v.start_time)
325-
cohort_period_end = df_cohorts["period"].apply(lambda v: v.end_time)
326-
df_cohorts["period_start"] = cohort_period_start
327-
df_cohorts["period_end"] = cohort_period_end
328-
# Create a label that is similar to the cohort metadata,
329-
# although this won't be perfect.
330-
df_cohorts["label"] = df_cohorts.apply(
331-
lambda v: f"{v.area}_{v.taxon[:4]}_{v.period}", axis="columns"
332-
)
333-
334-
# Apply minimum cohort size.
335-
df_cohorts = df_cohorts.query(f"size >= {min_cohort_size}").reset_index(drop=True)
336-
337-
# Early check for no cohorts.
338-
if len(df_cohorts) == 0:
339-
raise ValueError(
340-
"No cohorts available for the given sample selection parameters and minimum cohort size."
341-
)
342-
343-
return df_cohorts
344-
345-
346-
def _add_frequency_ci(*, ds, ci_method):
347-
from statsmodels.stats.proportion import proportion_confint # type: ignore
348-
349-
if ci_method is not None:
350-
count = ds["event_count"].values
351-
nobs = ds["event_nobs"].values
352-
with np.errstate(divide="ignore", invalid="ignore"):
353-
frq_ci_low, frq_ci_upp = proportion_confint(
354-
count=count, nobs=nobs, method=ci_method
355-
)
356-
ds["event_frequency_ci_low"] = ("variants", "cohorts"), frq_ci_low
357-
ds["event_frequency_ci_upp"] = ("variants", "cohorts"), frq_ci_upp
358-
359-
360-
def _make_sample_period_month(row):
361-
year = row.year
362-
month = row.month
363-
if year > 0 and month > 0:
364-
return pd.Period(freq="M", year=year, month=month)
365-
else:
366-
return pd.NaT
367-
368-
369-
def _make_sample_period_quarter(row):
370-
year = row.year
371-
month = row.month
372-
if year > 0 and month > 0:
373-
return pd.Period(freq="Q", year=year, month=month)
374-
else:
375-
return pd.NaT
376-
377-
378-
def _make_sample_period_year(row):
379-
year = row.year
380-
if year > 0:
381-
return pd.Period(freq="Y", year=year)
382-
else:
383-
return pd.NaT

malariagen_data/anoph/snp_frq.py

Lines changed: 7 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,13 @@
1111
import plotly.express as px # type: ignore
1212

1313
from .. import veff
14-
from ..util import check_types, pandas_apply
14+
from ..util import (
15+
check_types,
16+
pandas_apply,
17+
_prep_samples_for_cohort_grouping,
18+
_build_cohorts_from_sample_grouping,
19+
_add_frequency_ci,
20+
)
1521
from .snp_data import AnophelesSnpData
1622
from .sample_metadata import locate_cohorts
1723
from . import base_params, frq_params, map_params, plotly_params
@@ -1271,98 +1277,6 @@ def _make_snp_label_aa(aa_change, contig, position, ref_allele, alt_allele):
12711277
return label
12721278

12731279

1274-
def _make_sample_period_month(row):
1275-
year = row.year
1276-
month = row.month
1277-
if year > 0 and month > 0:
1278-
return pd.Period(freq="M", year=year, month=month)
1279-
else:
1280-
return pd.NaT
1281-
1282-
1283-
def _make_sample_period_quarter(row):
1284-
year = row.year
1285-
month = row.month
1286-
if year > 0 and month > 0:
1287-
return pd.Period(freq="Q", year=year, month=month)
1288-
else:
1289-
return pd.NaT
1290-
1291-
1292-
def _make_sample_period_year(row):
1293-
year = row.year
1294-
if year > 0:
1295-
return pd.Period(freq="Y", year=year)
1296-
else:
1297-
return pd.NaT
1298-
1299-
1300-
def _prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
1301-
# Take a copy, as we will modify the dataframe.
1302-
df_samples = df_samples.copy()
1303-
1304-
# Fix "intermediate" or "unassigned" taxon values - we only want to build
1305-
# cohorts with clean taxon calls, so we set other values to None.
1306-
loc_intermediate_taxon = (
1307-
df_samples["taxon"].str.startswith("intermediate").fillna(False)
1308-
)
1309-
df_samples.loc[loc_intermediate_taxon, "taxon"] = None
1310-
loc_unassigned_taxon = (
1311-
df_samples["taxon"].str.startswith("unassigned").fillna(False)
1312-
)
1313-
df_samples.loc[loc_unassigned_taxon, "taxon"] = None
1314-
1315-
# Add period column.
1316-
if period_by == "year":
1317-
make_period = _make_sample_period_year
1318-
elif period_by == "quarter":
1319-
make_period = _make_sample_period_quarter
1320-
elif period_by == "month":
1321-
make_period = _make_sample_period_month
1322-
else: # pragma: no cover
1323-
raise ValueError(
1324-
f"Value for period_by parameter must be one of 'year', 'quarter', 'month'; found {period_by!r}."
1325-
)
1326-
sample_period = df_samples.apply(make_period, axis="columns")
1327-
df_samples["period"] = sample_period
1328-
1329-
# Add area column for consistent output.
1330-
df_samples["area"] = df_samples[area_by]
1331-
1332-
return df_samples
1333-
1334-
1335-
def _build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_size):
1336-
# Build cohorts dataframe.
1337-
df_cohorts = group_samples_by_cohort.agg(
1338-
size=("sample_id", len),
1339-
lat_mean=("latitude", "mean"),
1340-
lat_max=("latitude", "max"),
1341-
lat_min=("latitude", "min"),
1342-
lon_mean=("longitude", "mean"),
1343-
lon_max=("longitude", "max"),
1344-
lon_min=("longitude", "min"),
1345-
)
1346-
# Reset index so that the index fields are included as columns.
1347-
df_cohorts = df_cohorts.reset_index()
1348-
1349-
# Add cohort helper variables.
1350-
cohort_period_start = df_cohorts["period"].apply(lambda v: v.start_time)
1351-
cohort_period_end = df_cohorts["period"].apply(lambda v: v.end_time)
1352-
df_cohorts["period_start"] = cohort_period_start
1353-
df_cohorts["period_end"] = cohort_period_end
1354-
# Create a label that is similar to the cohort metadata,
1355-
# although this won't be perfect.
1356-
df_cohorts["label"] = df_cohorts.apply(
1357-
lambda v: f"{v.area}_{v.taxon[:4]}_{v.period}", axis="columns"
1358-
)
1359-
1360-
# Apply minimum cohort size.
1361-
df_cohorts = df_cohorts.query(f"size >= {min_cohort_size}").reset_index(drop=True)
1362-
1363-
return df_cohorts
1364-
1365-
13661280
def _cohort_alt_allele_counts_melt(*, gt, indices, max_allele):
13671281
ac_alt_melt, an = _cohort_alt_allele_counts_melt_kernel(gt, indices, max_allele)
13681282
an_melt = np.repeat(an, max_allele, axis=0)
@@ -1396,20 +1310,6 @@ def _cohort_alt_allele_counts_melt_kernel(
13961310
return ac_alt_melt, an
13971311

13981312

1399-
def _add_frequency_ci(*, ds, ci_method):
1400-
from statsmodels.stats.proportion import proportion_confint # type: ignore
1401-
1402-
if ci_method is not None:
1403-
count = ds["event_count"].values
1404-
nobs = ds["event_nobs"].values
1405-
with np.errstate(divide="ignore", invalid="ignore"):
1406-
frq_ci_low, frq_ci_upp = proportion_confint(
1407-
count=count, nobs=nobs, method=ci_method
1408-
)
1409-
ds["event_frequency_ci_low"] = ("variants", "cohorts"), frq_ci_low
1410-
ds["event_frequency_ci_upp"] = ("variants", "cohorts"), frq_ci_upp
1411-
1412-
14131313
def _map_snp_to_aa_change_frq_ds(ds):
14141314
# Keep only variables that make sense for amino acid substitutions.
14151315
keep_vars = [

malariagen_data/util.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1585,3 +1585,115 @@ def _karyotype_tags_n_alt(gt, alts, inversion_alts):
15851585
inv_n_alt[i, j] = n_tag_alleles
15861586

15871587
return inv_n_alt
1588+
1589+
1590+
def _prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
    """Prepare a copy of the sample metadata for grouping into cohorts.

    Parameters
    ----------
    df_samples : pandas.DataFrame
        Sample metadata. Must provide a "taxon" column, the column named by
        `area_by`, and the columns read by the selected period maker.
    area_by : str
        Name of the column whose values are copied into the "area" column.
    period_by : {"year", "quarter", "month"}
        Granularity used to build the "period" column.

    Returns
    -------
    pandas.DataFrame
        A modified copy of `df_samples` in which "intermediate..." and
        "unassigned..." taxon values are replaced by None, and "period" and
        "area" columns have been added. The input dataframe is not modified.

    Raises
    ------
    ValueError
        If `period_by` is not one of 'year', 'quarter' or 'month'.
    """
    # Resolve the period maker first, so that an unsupported `period_by`
    # fails before any copying or column work is done.
    if period_by == "year":
        make_period = _make_sample_period_year
    elif period_by == "quarter":
        make_period = _make_sample_period_quarter
    elif period_by == "month":
        make_period = _make_sample_period_month
    else:  # pragma: no cover
        raise ValueError(
            f"Value for period_by parameter must be one of 'year', 'quarter', 'month'; found {period_by!r}."
        )

    # Take a copy, as we will modify the dataframe.
    df_samples = df_samples.copy()

    # Fix "intermediate" or "unassigned" taxon values - we only want to build
    # cohorts with clean taxon calls, so we set other values to None.
    # Passing na=False makes missing taxon values compare as False directly,
    # which avoids a separate fillna() pass over an object-dtype result.
    taxon = df_samples["taxon"]
    loc_unclean_taxon = taxon.str.startswith(
        "intermediate", na=False
    ) | taxon.str.startswith("unassigned", na=False)
    df_samples.loc[loc_unclean_taxon, "taxon"] = None

    # Add period column, computed row by row.
    df_samples["period"] = df_samples.apply(make_period, axis="columns")

    # Add area column for consistent output.
    df_samples["area"] = df_samples[area_by]

    return df_samples
1623+
1624+
1625+
def _build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_size):
    """Summarise grouped samples into a dataframe with one row per cohort.

    Parameters
    ----------
    group_samples_by_cohort : pandas.core.groupby.DataFrameGroupBy
        Samples grouped by cohort. The grouped frame must provide
        "sample_id", "latitude" and "longitude" columns, and the grouping
        keys must include "area", "taxon" and "period".
    min_cohort_size : int
        Cohorts with fewer samples than this are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per retained cohort, with the grouping keys, "size", the
        latitude/longitude mean/max/min columns, "period_start",
        "period_end" and "label".

    Raises
    ------
    ValueError
        If no cohort satisfies the minimum cohort size.
    """
    # Build cohorts dataframe.
    df_cohorts = group_samples_by_cohort.agg(
        size=("sample_id", len),
        lat_mean=("latitude", "mean"),
        lat_max=("latitude", "max"),
        lat_min=("latitude", "min"),
        lon_mean=("longitude", "mean"),
        lon_max=("longitude", "max"),
        lon_min=("longitude", "min"),
    )
    # Reset index so that the index fields are included as columns.
    df_cohorts = df_cohorts.reset_index()

    # Apply minimum cohort size before deriving any further columns, so that
    # no work is spent on cohorts that are about to be discarded.
    df_cohorts = df_cohorts.loc[df_cohorts["size"] >= min_cohort_size].reset_index(
        drop=True
    )

    # Early check for no cohorts. Doing this before the helper columns are
    # built guarantees the caller sees this message even when the grouping
    # produced no groups at all.
    if df_cohorts.empty:
        raise ValueError(
            "No cohorts available for the given sample selection parameters and minimum cohort size."
        )

    # Add cohort helper variables.
    df_cohorts["period_start"] = df_cohorts["period"].apply(lambda v: v.start_time)
    df_cohorts["period_end"] = df_cohorts["period"].apply(lambda v: v.end_time)

    # Create a label that is similar to the cohort metadata,
    # although this won't be perfect.
    df_cohorts["label"] = [
        f"{area}_{taxon[:4]}_{period}"
        for area, taxon, period in zip(
            df_cohorts["area"], df_cohorts["taxon"], df_cohorts["period"]
        )
    ]

    return df_cohorts
1660+
1661+
1662+
def _add_frequency_ci(*, ds, ci_method):
    """Add frequency confidence interval variables to a dataset in place.

    Parameters
    ----------
    ds : mapping-like dataset
        Must provide "event_count" and "event_nobs" entries exposing a
        `.values` attribute. When a CI method is given, the entries
        "event_frequency_ci_low" and "event_frequency_ci_upp" are assigned
        with dimensions ("variants", "cohorts").
    ci_method : str or None
        Method name passed through to statsmodels' `proportion_confint`.
        If None, nothing is computed and `ds` is left untouched.

    Returns
    -------
    None
    """
    # Nothing requested: return before importing statsmodels, so the no-op
    # path neither pays the import cost nor requires the package.
    if ci_method is None:
        return

    from statsmodels.stats.proportion import proportion_confint  # type: ignore

    count = ds["event_count"].values
    nobs = ds["event_nobs"].values
    # Silence divide/invalid floating point warnings while computing the
    # intervals (e.g. where nobs is zero).
    with np.errstate(divide="ignore", invalid="ignore"):
        frq_ci_low, frq_ci_upp = proportion_confint(
            count=count, nobs=nobs, method=ci_method
        )
    ds["event_frequency_ci_low"] = ("variants", "cohorts"), frq_ci_low
    ds["event_frequency_ci_upp"] = ("variants", "cohorts"), frq_ci_upp
1674+
1675+
1676+
def _make_sample_period_month(row):
    """Return the monthly period for a sample row, or NaT if undated.

    The row must expose `year` and `month` attributes. A period is only
    built when both values are greater than zero.
    """
    year, month = row.year, row.month
    if not (year > 0 and month > 0):
        return pd.NaT
    return pd.Period(freq="M", year=year, month=month)
1683+
1684+
1685+
def _make_sample_period_quarter(row):
    """Return the quarterly period for a sample row, or NaT if undated.

    The row must expose `year` and `month` attributes. A period is only
    built when both values are greater than zero.
    """
    year, month = row.year, row.month
    if not (year > 0 and month > 0):
        return pd.NaT
    return pd.Period(freq="Q", year=year, month=month)
1692+
1693+
1694+
def _make_sample_period_year(row):
    """Return the yearly period for a sample row, or NaT if undated.

    The row must expose a `year` attribute. A period is only built when
    the year is greater than zero.
    """
    year = row.year
    return pd.Period(freq="Y", year=year) if year > 0 else pd.NaT

0 commit comments

Comments
 (0)