Skip to content

Commit ef6991f

Browse files
authored
Merge pull request #630 from malariagen/367-haps-freq
Haplotype frequency analysis methods
2 parents d6d9d13 + 1321ede commit ef6991f

File tree

12 files changed

+785
-154
lines changed

12 files changed

+785
-154
lines changed

docs/source/Af1.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ SNP and CNV frequency analysis
115115
aa_allele_frequencies_advanced
116116
gene_cnv_frequencies
117117
gene_cnv_frequencies_advanced
118+
haplotypes_frequencies
119+
haplotypes_frequencies_advanced
118120
plot_frequencies_heatmap
119121
plot_frequencies_time_series
120122
plot_frequencies_interactive_map

docs/source/Ag3.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,8 @@ SNP and CNV frequency analysis
125125
aa_allele_frequencies_advanced
126126
gene_cnv_frequencies
127127
gene_cnv_frequencies_advanced
128+
haplotypes_frequencies
129+
haplotypes_frequencies_advanced
128130
plot_frequencies_heatmap
129131
plot_frequencies_time_series
130132
plot_frequencies_interactive_map

malariagen_data/anoph/h12.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from collections import Counter
21
from typing import Optional, Tuple, Dict, Mapping
32

43
import allel # type: ignore
@@ -7,7 +6,7 @@
76
import bokeh.plotting
87

98
from .hap_data import AnophelesHapData
10-
from ..util import hash_columns, check_types, CacheMiss
9+
from ..util import check_types, CacheMiss, haplotype_frequencies
1110
from . import base_params
1211
from . import h12_params, gplt_params, hap_params
1312

@@ -840,21 +839,11 @@ def plot_h12_gwss_multi_panel(
840839
return fig
841840

842841

843-
def haplotype_frequencies(h):
844-
"""Compute haplotype frequencies, returning a dictionary that maps
845-
haplotype hash values to frequencies."""
846-
n = h.shape[1]
847-
hashes = hash_columns(np.asarray(h))
848-
counts = Counter(hashes)
849-
freqs = {key: count / n for key, count in counts.items()}
850-
return freqs
851-
852-
853842
def garud_h12(ht):
854843
"""Compute Garud's H12."""
855844

856845
# Compute haplotype frequencies.
857-
frq_counter = haplotype_frequencies(ht)
846+
frq_counter, _, _ = haplotype_frequencies(ht)
858847

859848
# Convert to array of sorted frequencies.
860849
f = np.sort(np.fromiter(frq_counter.values(), dtype=float))[::-1]

malariagen_data/anoph/h1x.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,9 @@
66
import bokeh.plotting
77

88
from .hap_data import AnophelesHapData
9-
from ..util import check_types, CacheMiss
9+
from ..util import check_types, CacheMiss, haplotype_frequencies
1010
from . import base_params
1111
from . import h12_params, gplt_params, hap_params
12-
from .h12 import haplotype_frequencies
1312

1413

1514
class AnophelesH1XAnalysis(
def haplotype_joint_frequencies(ha, hb):
    """Compute the joint frequency of haplotypes in two different
    cohorts.

    Parameters
    ----------
    ha, hb
        Haplotype arrays for the two cohorts, in the form accepted by
        `haplotype_frequencies` (haplotypes as columns).

    Returns
    -------
    dict
        Mapping from haplotype hash value to the product of that
        haplotype's frequencies in the two cohorts. Haplotypes absent
        from one cohort contribute a joint frequency of 0.
    """
    # haplotype_frequencies returns (frequencies, counts, nobs); only the
    # frequency mappings are needed here.
    frqa, _, _ = haplotype_frequencies(ha)
    frqb, _, _ = haplotype_frequencies(hb)
    # Union of hashes seen in either cohort, so haplotypes private to one
    # cohort are still represented (with joint frequency 0).
    keys = set(frqa.keys()) | set(frqb.keys())
    joint_freqs = {key: frqa.get(key, 0) * frqb.get(key, 0) for key in keys}
    return joint_freqs

malariagen_data/anoph/hap_frq.py

Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
from typing import Optional

import pandas as pd
import numpy as np
import xarray as xr
import allel
from numpydoc_decorator import doc  # type: ignore

from ..util import (
    check_types,
    haplotype_frequencies,
    prep_samples_for_cohort_grouping,
    build_cohorts_from_sample_grouping,
    add_frequency_ci,
)
from .hap_data import AnophelesHapData
from .sample_metadata import locate_cohorts
from . import base_params, frq_params


class AnophelesHapFrequencyAnalysis(
    AnophelesHapData,
):
    """Haplotype frequency analysis methods.

    Mixin providing computation of haplotype frequencies per cohort,
    both as a simple dataframe and as an xarray dataset with
    space/time/taxon cohort grouping and confidence intervals.
    """

    def __init__(
        self,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

    @check_types
    @doc(
        summary="""
            Compute haplotype frequencies for a region.
        """,
        returns="""
            A dataframe of haplotype frequencies, one row per haplotype,
            indexed by label ("H0", "H1", ...) in descending order of
            maximum frequency across cohorts. Columns are one "frq_{cohort}"
            column per cohort plus "max_af".
        """,
        notes="""
            Cohorts with fewer samples than `min_cohort_size` will be excluded from
            output data frame.
        """,
    )
    def haplotypes_frequencies(
        self,
        region: base_params.region,
        cohorts: base_params.cohorts,
        sample_query: Optional[base_params.sample_query] = None,
        sample_query_options: Optional[base_params.sample_query_options] = None,
        min_cohort_size: base_params.min_cohort_size = 10,
        sample_sets: Optional[base_params.sample_sets] = None,
        chunks: base_params.chunks = base_params.native_chunks,
        inline_array: base_params.inline_array = base_params.inline_array_default,
    ) -> pd.DataFrame:
        # Access sample metadata.
        df_samples = self.sample_metadata(
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
        )

        # Build cohort dictionary, mapping cohort labels to boolean sample
        # indexers. Cohorts below min_cohort_size are removed here.
        coh_dict = locate_cohorts(
            cohorts=cohorts, data=df_samples, min_cohort_size=min_cohort_size
        )

        # Access haplotypes.
        ds_haps = self.haplotypes(
            region=region,
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
            chunks=chunks,
            inline_array=inline_array,
        )

        # Early check for no SNPs.
        if ds_haps.sizes["variants"] == 0:  # pragma: no cover
            raise ValueError("No SNPs available for the given region.")

        # Access genotypes and bring into memory.
        gt = allel.GenotypeDaskArray(ds_haps["call_genotype"].data)
        with self._dask_progress(desc="Compute haplotypes"):
            gt = gt.compute()

        # Frequencies over all samples establish the full set of haplotype
        # hashes; every cohort is then reported against this same key set.
        f_all, _, _ = haplotype_frequencies(gt.to_haplotypes())
        all_keys = list(f_all)

        # Compute frequencies per cohort.
        freq_cols = dict()
        cohorts_iterator = self._progress(
            coh_dict.items(), desc="Compute allele frequencies"
        )
        for coh, loc_coh in cohorts_iterator:
            # Start from zero for every known haplotype, so cohorts that
            # lack a haplotype still report a 0 frequency for it.
            hap_dict = {k: 0 for k in all_keys}

            n_samples = np.count_nonzero(loc_coh)
            # Internal invariant: locate_cohorts has already applied the
            # minimum cohort size filter.
            assert n_samples >= min_cohort_size
            gt_coh = gt.compress(loc_coh, axis=1)
            f, _, _ = haplotype_frequencies(gt_coh.to_haplotypes())
            # Update frequencies for the haplotypes observed in this cohort.
            hap_dict.update(f)
            freq_cols["frq_" + coh] = list(hap_dict.values())

        # N.B., index by the stable key list rather than a loop-local dict.
        df_freqs = pd.DataFrame(freq_cols, index=all_keys)

        # Compute max_af across cohorts.
        df_max_af = pd.DataFrame({"max_af": df_freqs.max(axis=1)})

        # Build the final dataframe, sorted by descending max frequency.
        df_haps = pd.concat([df_freqs, df_max_af], axis=1)
        df_haps_sorted = df_haps.sort_values(by=["max_af"], ascending=False)
        df_haps_sorted["label"] = ["H" + str(i) for i in range(len(df_haps_sorted))]

        # Use the haplotype labels as the index. N.B., set_index returns a
        # new frame; the original code discarded the result, leaving the
        # hash-valued index in place.
        df_haps_sorted = df_haps_sorted.set_index("label")

        return df_haps_sorted

    @check_types
    @doc(
        summary="""
            Group samples by taxon, area (space) and period (time), then compute
            haplotype frequencies.
        """,
        returns="""
            The resulting dataset has dimensions "cohorts" and "variants",
            where each variant is a distinct haplotype. Variables prefixed
            with "cohort" are 1-dimensional arrays with data about the
            cohorts, such as the area, period, taxon and cohort size.
            "variant_label" is a 1-dimensional array of haplotype labels.
            Variables prefixed with "event" are 2-dimensional arrays with
            the haplotype counts and frequency calculations.
        """,
    )
    def haplotypes_frequencies_advanced(
        self,
        region: base_params.region,
        area_by: frq_params.area_by,
        period_by: frq_params.period_by,
        sample_sets: Optional[base_params.sample_sets] = None,
        sample_query: Optional[base_params.sample_query] = None,
        sample_query_options: Optional[base_params.sample_query_options] = None,
        min_cohort_size: base_params.min_cohort_size = 10,
        ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
        chunks: base_params.chunks = base_params.native_chunks,
        inline_array: base_params.inline_array = base_params.inline_array_default,
    ) -> xr.Dataset:
        # Load sample metadata.
        df_samples = self.sample_metadata(
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
        )

        # Prepare sample metadata for cohort grouping.
        df_samples = prep_samples_for_cohort_grouping(
            df_samples=df_samples,
            area_by=area_by,
            period_by=period_by,
        )

        # Group samples to make cohorts.
        group_samples_by_cohort = df_samples.groupby(["taxon", "area", "period"])

        # Build cohorts dataframe; cohorts below min_cohort_size are dropped.
        df_cohorts = build_cohorts_from_sample_grouping(
            group_samples_by_cohort=group_samples_by_cohort,
            min_cohort_size=min_cohort_size,
        )

        # Access haplotypes.
        ds_haps = self.haplotypes(
            region=region,
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
            chunks=chunks,
            inline_array=inline_array,
        )

        # Early check for no SNPs.
        if ds_haps.sizes["variants"] == 0:  # pragma: no cover
            raise ValueError("No SNPs available for the given region.")

        # Access genotypes and bring into memory.
        gt = allel.GenotypeDaskArray(ds_haps["call_genotype"].data)
        with self._dask_progress(desc="Compute haplotypes"):
            gt = gt.compute()

        # Frequencies over all samples establish the full set of haplotype
        # hashes; every cohort is then reported against this same key set.
        f_all, _, _ = haplotype_frequencies(gt.to_haplotypes())
        all_keys = list(f_all)

        # Compute frequencies, counts and nobs per cohort.
        freq_cols = dict()
        count_cols = dict()
        nobs_cols = dict()
        cohorts_iterator = self._progress(
            df_cohorts.itertuples(), desc="Compute allele frequencies"
        )
        for cohort in cohorts_iterator:
            cohort_key = cohort.taxon, cohort.area, cohort.period
            cohort_key_str = cohort.taxon + "_" + cohort.area + "_" + str(cohort.period)

            # Internal invariant: build_cohorts_from_sample_grouping has
            # already applied the minimum cohort size filter.
            n_samples = cohort.size
            assert n_samples >= min_cohort_size

            # Reset frequencies and counts to 0 for each cohort; nobs is
            # the number of haplotypes in the cohort (2 per sample).
            hap_freq = {k: 0 for k in all_keys}
            hap_count = {k: 0 for k in all_keys}
            hap_nob = {k: 2 * n_samples for k in all_keys}

            sample_indices = group_samples_by_cohort.indices[cohort_key]
            gt_coh = gt.take(sample_indices, axis=1)
            f, c, o = haplotype_frequencies(gt_coh.to_haplotypes())
            # Update values for the haplotypes observed in this cohort; the
            # nobs values should remain 2 * n_samples.
            hap_freq.update(f)
            hap_count.update(c)
            hap_nob.update(o)
            freq_cols["frq_" + cohort_key_str] = list(hap_freq.values())
            count_cols["count_" + cohort_key_str] = list(hap_count.values())
            nobs_cols["nobs_" + cohort_key_str] = list(hap_nob.values())

        # N.B., index by the stable key list rather than loop-local dicts.
        df_freqs = pd.DataFrame(freq_cols, index=all_keys)
        df_counts = pd.DataFrame(count_cols, index=all_keys)
        df_nobs = pd.DataFrame(nobs_cols, index=all_keys)

        # Compute max_af across cohorts.
        df_max_af = pd.DataFrame({"max_af": df_freqs.max(axis=1)})

        # Build the final dataframe, sorted by descending max frequency.
        df_haps = pd.concat([df_freqs, df_counts, df_nobs, df_max_af], axis=1)
        df_haps_sorted = df_haps.sort_values(by=["max_af"], ascending=False)
        df_haps_sorted["label"] = ["H" + str(i) for i in range(len(df_haps_sorted))]

        # Use the haplotype labels as the index, keeping the "label" column
        # because it is needed below. N.B., set_index returns a new frame;
        # the original code discarded the result.
        df_haps_sorted = df_haps_sorted.set_index("label", drop=False)

        # Build the output dataset.
        ds_out = xr.Dataset()

        # Cohort variables.
        for coh_col in df_cohorts.columns:
            ds_out[f"cohort_{coh_col}"] = "cohorts", df_cohorts[coh_col]

        # Label the haplotypes.
        ds_out["variant_label"] = "variants", df_haps_sorted["label"].to_numpy()

        # Event variables. Select by column name rather than by position;
        # positional slicing of the mixed-dtype frame produced object-dtype
        # arrays and silently depended on the exact column layout.
        ds_out["event_frequency"] = (
            ("variants", "cohorts"),
            df_haps_sorted[list(freq_cols)].to_numpy(),
        )
        ds_out["event_count"] = (
            ("variants", "cohorts"),
            df_haps_sorted[list(count_cols)].to_numpy(),
        )
        ds_out["event_nobs"] = (
            ("variants", "cohorts"),
            df_haps_sorted[list(nobs_cols)].to_numpy(),
        )

        # Add confidence intervals.
        add_frequency_ci(ds=ds_out, ci_method=ci_method)

        return ds_out

malariagen_data/anoph/sample_metadata.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1343,7 +1343,7 @@ def plot_sample_location_geo(
13431343
return fig
13441344

13451345

1346-
def locate_cohorts(*, cohorts, data):
1346+
def locate_cohorts(*, cohorts, data, min_cohort_size):
13471347
# Build cohort dictionary where key=cohort_id, value=loc_coh.
13481348
coh_dict = {}
13491349

@@ -1374,4 +1374,17 @@ def locate_cohorts(*, cohorts, data):
13741374
loc_coh = data[cohorts] == coh
13751375
coh_dict[coh] = loc_coh.values
13761376

1377+
# Remove cohorts below minimum cohort size.
1378+
coh_dict = {
1379+
coh: loc_coh
1380+
for coh, loc_coh in coh_dict.items()
1381+
if np.count_nonzero(loc_coh) >= min_cohort_size
1382+
}
1383+
1384+
# Early check for no cohorts.
1385+
if len(coh_dict) == 0:
1386+
raise ValueError(
1387+
"No cohorts available for the given sample selection parameters and minimum cohort size."
1388+
)
1389+
13771390
return coh_dict

0 commit comments

Comments
 (0)