Skip to content

Commit 2e047c0

Browse files
timothymillarmergify[bot]
authored andcommitted
Pass genotype counts by coord in hardy_weinberg_test
1 parent 37d3d29 commit 2e047c0

File tree

2 files changed

+52
-14
lines changed

2 files changed

+52
-14
lines changed

sgkit/stats/hwe.py

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,8 @@ def hardy_weinberg_test(
149149
Name of variable containing precomputed genotype counts for each variant
150150
as described in :data:`sgkit.variables.variant_genotype_count_spec`.
151151
If the variable is not present in ``ds``, it will be computed
152-
using :func:`count_variant_genotypes`.
152+
using :func:`count_variant_genotypes` which automatically assigns
153+
coordinates to the ``genotypes`` dimension.
153154
ploidy
154155
Genotype ploidy, defaults to ``ploidy`` dimension of provided dataset.
155156
If the `ploidy` dimension is not present, then this value must be set explicitly.
@@ -168,10 +169,11 @@ def hardy_weinberg_test(
168169
169170
Warnings
170171
--------
171-
This function is only applicable to diploid, biallelic datasets and the
172-
``genotype_count`` array should have three columns which respectively
173-
contain counts for homozygous reference, heterozygous, and homozygous
174-
alternate genotypes.
172+
This function is only applicable to diploid, biallelic datasets.
173+
The ``genotype_count`` array should have three columns corresponding
174+
to the ``genotypes`` dimension. These columns should have coordinates
175+
``'0/0'``, ``'0/1'``, and ``'1/1'`` which respectively contain counts for
176+
homozygous reference, heterozygous, and homozygous alternate genotypes.
175177
176178
Returns
177179
-------
@@ -180,18 +182,24 @@ def hardy_weinberg_test(
180182
variant_hwe_p_value : [array-like, shape: (N, O)]
181183
P values from HWE test for each variant as float in [0, 1].
182184
185+
Raises
186+
------
187+
NotImplementedError
188+
If the dataset is not limited to biallelic, diploid genotypes.
189+
ValueError
190+
If the ploidy or number of alleles are not specified and not
191+
present as dimensions in the dataset.
192+
ValueError
193+
If no coordinates are assigned to the ``genotypes`` dimension.
194+
KeyError
195+
If the genotypes ``'0/0'``, ``'0/1'`` or ``'1/1'`` are not specified
196+
as coordinates of the ``genotypes`` dimension.
197+
183198
References
184199
----------
185200
- [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
186201
“A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
187202
Human Genetics 76 (5): 887–93.
188-
189-
Raises
190-
------
191-
NotImplementedError
192-
If ploidy of provided dataset != 2
193-
NotImplementedError
194-
If maximum number of alleles in provided dataset != 2
195203
"""
196204
ploidy = ploidy or ds.dims.get("ploidy")
197205
if not ploidy:
@@ -214,8 +222,18 @@ def hardy_weinberg_test(
214222
)
215223
variables.validate(ds, {genotype_count: variables.variant_genotype_count_spec})
216224

217-
# extract counts by type
218-
obs_hom1, obs_hets, obs_hom2 = list(da.asarray(ds[genotype_count].data).T)
225+
# extract counts by coords
226+
if "genotypes" not in ds.coords:
227+
raise ValueError("No coordinates for dimension 'genotypes'")
228+
try:
229+
key = "0/0"
230+
obs_hom1 = da.asarray(ds[genotype_count].loc[:, key].data)
231+
key = "0/1"
232+
obs_hets = da.asarray(ds[genotype_count].loc[:, key].data)
233+
key = "1/1"
234+
obs_hom2 = da.asarray(ds[genotype_count].loc[:, key].data)
235+
except KeyError as e:
236+
raise KeyError("No counts for genotype '{}'".format(key)) from e
219237

220238
# note obs_het is expected first
221239
p = da.map_blocks(hardy_weinberg_p_value_vec_jit, obs_hets, obs_hom1, obs_hom2)

sgkit/tests/test_hwe.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ def test_hwep_dataset__precomputed_counts(ds_neq: Dataset) -> None:
139139
cts = [0, 1, 2] # arg order: hom1, hets, hom2
140140
gtc = xr.concat([(ac == ct).sum(dim="samples") for ct in cts], dim="genotypes").T
141141
ds = ds.assign(**{"variant_genotype_count": gtc})
142+
ds = ds.assign_coords(dict(genotypes=["0/0", "0/1", "1/1"]))
142143
p = hwep_test(ds, genotype_count="variant_genotype_count", merge=False)[
143144
"variant_hwe_p_value"
144145
].values
@@ -177,3 +178,22 @@ def test_hwep_dataset__raise_on_nonbiallelic():
177178
):
178179
ds = xr.Dataset({"x": (("ploidy", "alleles"), np.zeros((2, 3)))})
179180
hwep_test(ds)
181+
182+
183+
def test_hwep__raise_on_no_coords():
184+
ds = simulate_genotype_call_dataset(n_variant=10, n_sample=5, n_allele=2, seed=0)
185+
ds["variant_genotype_count"] = ["variants", "genotypes"], np.ones(
186+
(10, 3), dtype=int
187+
)
188+
with pytest.raises(ValueError, match="No coordinates for dimension 'genotypes'"):
189+
hwep_test(ds)
190+
191+
192+
def test_hwep__raise_on_missing_key():
193+
ds = simulate_genotype_call_dataset(n_variant=10, n_sample=5, n_allele=2, seed=0)
194+
ds["variant_genotype_count"] = ["variants", "genotypes"], np.ones(
195+
(10, 3), dtype=int
196+
)
197+
ds = ds.assign_coords(dict(genotypes=["0/0", "0/", "1/1"])) # malformed
198+
with pytest.raises(KeyError, match="No counts for genotype '0/1'"):
199+
hwep_test(ds)

0 commit comments

Comments
 (0)