from typing import Optional

import pandas as pd
import numpy as np
import xarray as xr
import allel
from numpydoc_decorator import doc  # type: ignore

from ..util import (
    check_types,
    haplotype_frequencies,
    prep_samples_for_cohort_grouping,
    build_cohorts_from_sample_grouping,
    add_frequency_ci,
)
from .hap_data import AnophelesHapData
from .sample_metadata import locate_cohorts
from . import base_params, frq_params


class AnophelesHapFrequencyAnalysis(
    AnophelesHapData,
):
    def __init__(
        self,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

    @check_types
    @doc(
        summary="""
        Compute haplotype frequencies for a region.
        """,
        returns="""
        A dataframe of haplotype frequencies, one row per haplotype.
        """,
        notes="""
        Cohorts with fewer samples than `min_cohort_size` will be excluded
        from the output data frame.
        """,
    )
    def haplotypes_frequencies(
        self,
        region: base_params.region,
        cohorts: base_params.cohorts,
        sample_query: Optional[base_params.sample_query] = None,
        sample_query_options: Optional[base_params.sample_query_options] = None,
        min_cohort_size: base_params.min_cohort_size = 10,
        sample_sets: Optional[base_params.sample_sets] = None,
        chunks: base_params.chunks = base_params.native_chunks,
        inline_array: base_params.inline_array = base_params.inline_array_default,
    ) -> pd.DataFrame:
        # Access sample metadata.
        df_samples = self.sample_metadata(
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
        )

        # Build cohort dictionary, mapping cohort labels to boolean indexers.
        coh_dict = locate_cohorts(
            cohorts=cohorts, data=df_samples, min_cohort_size=min_cohort_size
        )

        # Access haplotypes.
        ds_haps = self.haplotypes(
            region=region,
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
            chunks=chunks,
            inline_array=inline_array,
        )

        # Early check for no SNPs.
        if ds_haps.sizes["variants"] == 0:  # pragma: no cover
            raise ValueError("No SNPs available for the given region.")

        # Access genotypes.
        gt = allel.GenotypeDaskArray(ds_haps["call_genotype"].data)
        with self._dask_progress(desc="Compute haplotypes"):
            gt = gt.compute()

        # List all haplotypes observed across the full set of samples.
        gt_hap = gt.to_haplotypes()
        f_all, _, _ = haplotype_frequencies(gt_hap)
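        # N.B., the keys of f_all (one per distinct haplotype) provide a
        # common index, so that the per-cohort frequency columns computed
        # below all align on the same set of haplotypes.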

        # Compute haplotype frequencies for each cohort.
        freq_cols = dict()
        cohorts_iterator = self._progress(
            coh_dict.items(), desc="Compute haplotype frequencies"
        )
        for coh, loc_coh in cohorts_iterator:
            # Reset all frequencies to zero for this cohort.
            hap_dict = {k: 0.0 for k in f_all.keys()}

            n_samples = np.count_nonzero(loc_coh)
            assert n_samples >= min_cohort_size
            gt_coh = gt.compress(loc_coh, axis=1)
            gt_hap = gt_coh.to_haplotypes()
            f, _, _ = haplotype_frequencies(gt_hap)
            # Update frequencies for the haplotypes observed in this cohort.
            hap_dict.update(f)
            freq_cols["frq_" + coh] = list(hap_dict.values())

        df_freqs = pd.DataFrame(freq_cols, index=list(f_all.keys()))

        # Compute max_af.
        df_max_af = pd.DataFrame({"max_af": df_freqs.max(axis=1)})

        # Build the final dataframe.
        df_haps = pd.concat([df_freqs, df_max_af], axis=1)

        # Sort by maximum frequency and assign haplotype labels.
        df_haps_sorted = df_haps.sort_values(by=["max_af"], ascending=False)
        df_haps_sorted["label"] = [f"H{i}" for i in range(len(df_haps_sorted))]

        # Use the haplotype labels as the index.
        df_haps_sorted = df_haps_sorted.set_index("label")

        return df_haps_sorted
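
    # Usage sketch (illustrative only; the region, cohorts column and sample
    # set below are assumptions, not taken from this module): with this mixin
    # composed into a concrete data resource such as malariagen_data.Ag3, a
    # call might look like:
    #
    #     import malariagen_data
    #
    #     ag3 = malariagen_data.Ag3()
    #     df_freqs = ag3.haplotypes_frequencies(
    #         region="2L:2,420,000-2,430,000",
    #         cohorts="admin1_year",
    #         sample_sets=["AG1000G-BF-A"],
    #     )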

    @check_types
    @doc(
        summary="""
        Group samples by taxon, area (space) and period (time), then compute
        haplotype frequencies.
        """,
        returns="""
        The resulting dataset contains data with dimensions "cohorts" and
        "variants". Variables prefixed with "cohort" are 1-dimensional
        arrays with data about the cohorts, such as the area, period, taxon
        and cohort size. Variables prefixed with "variant" are
        1-dimensional arrays with data about the haplotypes, such as the
        haplotype label. Variables prefixed with "event" are 2-dimensional
        arrays with the haplotype counts and frequency calculations.
        """,
    )
    def haplotypes_frequencies_advanced(
        self,
        region: base_params.region,
        area_by: frq_params.area_by,
        period_by: frq_params.period_by,
        sample_sets: Optional[base_params.sample_sets] = None,
        sample_query: Optional[base_params.sample_query] = None,
        sample_query_options: Optional[base_params.sample_query_options] = None,
        min_cohort_size: base_params.min_cohort_size = 10,
        ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
        chunks: base_params.chunks = base_params.native_chunks,
        inline_array: base_params.inline_array = base_params.inline_array_default,
    ) -> xr.Dataset:
        # Load sample metadata.
        df_samples = self.sample_metadata(
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
        )

        # Prepare sample metadata for cohort grouping.
        df_samples = prep_samples_for_cohort_grouping(
            df_samples=df_samples,
            area_by=area_by,
            period_by=period_by,
        )

        # Group samples to make cohorts.
        group_samples_by_cohort = df_samples.groupby(["taxon", "area", "period"])

        # Build cohorts dataframe.
        df_cohorts = build_cohorts_from_sample_grouping(
            group_samples_by_cohort=group_samples_by_cohort,
            min_cohort_size=min_cohort_size,
        )

        # Access haplotypes.
        ds_haps = self.haplotypes(
            region=region,
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
            chunks=chunks,
            inline_array=inline_array,
        )

        # Early check for no SNPs.
        if ds_haps.sizes["variants"] == 0:  # pragma: no cover
            raise ValueError("No SNPs available for the given region.")

        # Access genotypes.
        gt = allel.GenotypeDaskArray(ds_haps["call_genotype"].data)
        with self._dask_progress(desc="Compute haplotypes"):
            gt = gt.compute()

        # List all haplotypes observed across the full set of samples.
        gt_hap = gt.to_haplotypes()
        f_all, _, _ = haplotype_frequencies(gt_hap)

        # Compute per-cohort haplotype frequencies, counts and numbers of
        # observations.
        hap_freq: dict[np.int64, float] = dict()
        hap_count: dict[np.int64, int] = dict()
        hap_nob: dict[np.int64, int] = dict()
        freq_cols = dict()
        count_cols = dict()
        nobs_cols = dict()
        cohorts_iterator = self._progress(
            df_cohorts.itertuples(), desc="Compute haplotype frequencies"
        )
        for cohort in cohorts_iterator:
            cohort_key = cohort.taxon, cohort.area, cohort.period
            cohort_key_str = cohort.taxon + "_" + cohort.area + "_" + str(cohort.period)
            # Reset all frequencies and counts to zero for this cohort; nobs
            # is set to the number of haplotypes, i.e., twice the number of
            # (diploid) samples.
            n_samples = cohort.size
            hap_freq = {k: 0.0 for k in f_all.keys()}
            hap_count = {k: 0 for k in f_all.keys()}
            hap_nob = {k: 2 * n_samples for k in f_all.keys()}
            assert n_samples >= min_cohort_size
            sample_indices = group_samples_by_cohort.indices[cohort_key]
            gt_coh = gt.take(sample_indices, axis=1)
            gt_hap = gt_coh.to_haplotypes()
            f, c, o = haplotype_frequencies(gt_hap)
            # Update frequencies and counts for the haplotypes observed in
            # this cohort; nobs is also updated, although the values should
            # stay the same.
            hap_freq.update(f)
            hap_count.update(c)
            hap_nob.update(o)
            count_cols["count_" + cohort_key_str] = list(hap_count.values())
            freq_cols["frq_" + cohort_key_str] = list(hap_freq.values())
            nobs_cols["nobs_" + cohort_key_str] = list(hap_nob.values())

        df_freqs = pd.DataFrame(freq_cols, index=list(f_all.keys()))

        # Compute max_af.
        df_max_af = pd.DataFrame({"max_af": df_freqs.max(axis=1)})

        df_counts = pd.DataFrame(count_cols, index=list(f_all.keys()))

        df_nobs = pd.DataFrame(nobs_cols, index=list(f_all.keys()))

        # Build the final dataframe.
        df_haps = pd.concat([df_freqs, df_counts, df_nobs, df_max_af], axis=1)

        # Sort by maximum frequency and assign haplotype labels.
        df_haps_sorted = df_haps.sort_values(by=["max_af"], ascending=False)
        df_haps_sorted["label"] = [f"H{i}" for i in range(len(df_haps_sorted))]

        # Use the haplotype labels as the index.
        df_haps_sorted = df_haps_sorted.set_index("label")

        # Build the output dataset.
        ds_out = xr.Dataset()

        # Cohort variables.
        for coh_col in df_cohorts.columns:
            ds_out[f"cohort_{coh_col}"] = "cohorts", df_cohorts[coh_col]

        # Label the haplotypes.
        ds_out["variant_label"] = "variants", df_haps_sorted.index.to_numpy()

        # Event variables. Select columns by name prefix rather than by
        # position, so this is robust to changes in the column layout.
        frq_columns = [c for c in df_haps_sorted.columns if c.startswith("frq_")]
        count_columns = [c for c in df_haps_sorted.columns if c.startswith("count_")]
        nobs_columns = [c for c in df_haps_sorted.columns if c.startswith("nobs_")]
        ds_out["event_frequency"] = (
            ("variants", "cohorts"),
            df_haps_sorted[frq_columns].to_numpy(),
        )
        ds_out["event_count"] = (
            ("variants", "cohorts"),
            df_haps_sorted[count_columns].to_numpy(),
        )
        ds_out["event_nobs"] = (
            ("variants", "cohorts"),
            df_haps_sorted[nobs_columns].to_numpy(),
        )

        # Add confidence intervals.
        add_frequency_ci(ds=ds_out, ci_method=ci_method)

        return ds_out
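

# Usage sketch for the advanced analysis (illustrative only; the parameter
# values below are assumptions, not taken from this module, and ag3 is as in
# the sketch above). The returned dataset carries per-cohort counts,
# frequencies and, when ci_method is not None, confidence intervals added by
# add_frequency_ci.
#
#     ds = ag3.haplotypes_frequencies_advanced(
#         region="2L:2,420,000-2,430,000",
#         area_by="admin1_iso",
#         period_by="year",
#         sample_sets=["AG1000G-BF-A"],
#     )
#     ds["event_frequency"]  # haplotype frequencies, dims ("variants", "cohorts")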