|
11 | 11 | import plotly.express as px # type: ignore
|
12 | 12 |
|
13 | 13 | from .. import veff
|
14 |
| -from ..util import check_types, pandas_apply |
| 14 | +from ..util import ( |
| 15 | + check_types, |
| 16 | + pandas_apply, |
| 17 | + _prep_samples_for_cohort_grouping, |
| 18 | + _build_cohorts_from_sample_grouping, |
| 19 | + _add_frequency_ci, |
| 20 | +) |
15 | 21 | from .snp_data import AnophelesSnpData
|
16 | 22 | from .sample_metadata import locate_cohorts
|
17 | 23 | from . import base_params, frq_params, map_params, plotly_params
|
@@ -1271,98 +1277,6 @@ def _make_snp_label_aa(aa_change, contig, position, ref_allele, alt_allele):
|
1271 | 1277 | return label
|
1272 | 1278 |
|
1273 | 1279 |
|
1274 |
| -def _make_sample_period_month(row): |
1275 |
| - year = row.year |
1276 |
| - month = row.month |
1277 |
| - if year > 0 and month > 0: |
1278 |
| - return pd.Period(freq="M", year=year, month=month) |
1279 |
| - else: |
1280 |
| - return pd.NaT |
1281 |
| - |
1282 |
| - |
1283 |
| -def _make_sample_period_quarter(row): |
1284 |
| - year = row.year |
1285 |
| - month = row.month |
1286 |
| - if year > 0 and month > 0: |
1287 |
| - return pd.Period(freq="Q", year=year, month=month) |
1288 |
| - else: |
1289 |
| - return pd.NaT |
1290 |
| - |
1291 |
| - |
1292 |
| -def _make_sample_period_year(row): |
1293 |
| - year = row.year |
1294 |
| - if year > 0: |
1295 |
| - return pd.Period(freq="Y", year=year) |
1296 |
| - else: |
1297 |
| - return pd.NaT |
1298 |
| - |
1299 |
| - |
1300 |
| -def _prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by): |
1301 |
| - # Take a copy, as we will modify the dataframe. |
1302 |
| - df_samples = df_samples.copy() |
1303 |
| - |
1304 |
| - # Fix "intermediate" or "unassigned" taxon values - we only want to build |
1305 |
| - # cohorts with clean taxon calls, so we set other values to None. |
1306 |
| - loc_intermediate_taxon = ( |
1307 |
| - df_samples["taxon"].str.startswith("intermediate").fillna(False) |
1308 |
| - ) |
1309 |
| - df_samples.loc[loc_intermediate_taxon, "taxon"] = None |
1310 |
| - loc_unassigned_taxon = ( |
1311 |
| - df_samples["taxon"].str.startswith("unassigned").fillna(False) |
1312 |
| - ) |
1313 |
| - df_samples.loc[loc_unassigned_taxon, "taxon"] = None |
1314 |
| - |
1315 |
| - # Add period column. |
1316 |
| - if period_by == "year": |
1317 |
| - make_period = _make_sample_period_year |
1318 |
| - elif period_by == "quarter": |
1319 |
| - make_period = _make_sample_period_quarter |
1320 |
| - elif period_by == "month": |
1321 |
| - make_period = _make_sample_period_month |
1322 |
| - else: # pragma: no cover |
1323 |
| - raise ValueError( |
1324 |
| - f"Value for period_by parameter must be one of 'year', 'quarter', 'month'; found {period_by!r}." |
1325 |
| - ) |
1326 |
| - sample_period = df_samples.apply(make_period, axis="columns") |
1327 |
| - df_samples["period"] = sample_period |
1328 |
| - |
1329 |
| - # Add area column for consistent output. |
1330 |
| - df_samples["area"] = df_samples[area_by] |
1331 |
| - |
1332 |
| - return df_samples |
1333 |
| - |
1334 |
| - |
1335 |
| -def _build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_size): |
1336 |
| - # Build cohorts dataframe. |
1337 |
| - df_cohorts = group_samples_by_cohort.agg( |
1338 |
| - size=("sample_id", len), |
1339 |
| - lat_mean=("latitude", "mean"), |
1340 |
| - lat_max=("latitude", "max"), |
1341 |
| - lat_min=("latitude", "min"), |
1342 |
| - lon_mean=("longitude", "mean"), |
1343 |
| - lon_max=("longitude", "max"), |
1344 |
| - lon_min=("longitude", "min"), |
1345 |
| - ) |
1346 |
| - # Reset index so that the index fields are included as columns. |
1347 |
| - df_cohorts = df_cohorts.reset_index() |
1348 |
| - |
1349 |
| - # Add cohort helper variables. |
1350 |
| - cohort_period_start = df_cohorts["period"].apply(lambda v: v.start_time) |
1351 |
| - cohort_period_end = df_cohorts["period"].apply(lambda v: v.end_time) |
1352 |
| - df_cohorts["period_start"] = cohort_period_start |
1353 |
| - df_cohorts["period_end"] = cohort_period_end |
1354 |
| - # Create a label that is similar to the cohort metadata, |
1355 |
| - # although this won't be perfect. |
1356 |
| - df_cohorts["label"] = df_cohorts.apply( |
1357 |
| - lambda v: f"{v.area}_{v.taxon[:4]}_{v.period}", axis="columns" |
1358 |
| - ) |
1359 |
| - |
1360 |
| - # Apply minimum cohort size. |
1361 |
| - df_cohorts = df_cohorts.query(f"size >= {min_cohort_size}").reset_index(drop=True) |
1362 |
| - |
1363 |
| - return df_cohorts |
1364 |
| - |
1365 |
| - |
1366 | 1280 | def _cohort_alt_allele_counts_melt(*, gt, indices, max_allele):
|
1367 | 1281 | ac_alt_melt, an = _cohort_alt_allele_counts_melt_kernel(gt, indices, max_allele)
|
1368 | 1282 | an_melt = np.repeat(an, max_allele, axis=0)
|
@@ -1396,20 +1310,6 @@ def _cohort_alt_allele_counts_melt_kernel(
|
1396 | 1310 | return ac_alt_melt, an
|
1397 | 1311 |
|
1398 | 1312 |
|
1399 |
| -def _add_frequency_ci(*, ds, ci_method): |
1400 |
| - from statsmodels.stats.proportion import proportion_confint # type: ignore |
1401 |
| - |
1402 |
| - if ci_method is not None: |
1403 |
| - count = ds["event_count"].values |
1404 |
| - nobs = ds["event_nobs"].values |
1405 |
| - with np.errstate(divide="ignore", invalid="ignore"): |
1406 |
| - frq_ci_low, frq_ci_upp = proportion_confint( |
1407 |
| - count=count, nobs=nobs, method=ci_method |
1408 |
| - ) |
1409 |
| - ds["event_frequency_ci_low"] = ("variants", "cohorts"), frq_ci_low |
1410 |
| - ds["event_frequency_ci_upp"] = ("variants", "cohorts"), frq_ci_upp |
1411 |
| - |
1412 |
| - |
1413 | 1313 | def _map_snp_to_aa_change_frq_ds(ds):
|
1414 | 1314 | # Keep only variables that make sense for amino acid substitutions.
|
1415 | 1315 | keep_vars = [
|
|