Skip to content

Commit 919d3a5

Browse files
timothymillarmergify[bot]
authored andcommitted
Endelman-Jannink GRM #1062
1 parent 07e75c2 commit 919d3a5

File tree

4 files changed

+369
-48
lines changed

4 files changed

+369
-48
lines changed

sgkit/stats/grm.py

Lines changed: 108 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Hashable, Optional
1+
from typing import Hashable, Optional, Tuple
22

33
import dask.array as da
44
import numpy as np
@@ -9,13 +9,17 @@
99
from sgkit.typing import ArrayLike
1010
from sgkit.utils import conditional_merge_datasets, create_dataset
1111

12+
EST_VAN_RADEN = "VanRaden"
13+
EST_ENDELMAN_JANNINK = "Endelman-Jannink"
14+
EST_MARTINI = "Martini"
15+
1216

1317
def _grm_VanRaden(
1418
call_dosage: ArrayLike,
1519
ancestral_frequency: ArrayLike,
1620
ploidy: int,
1721
skipna: bool = False,
18-
):
22+
) -> ArrayLike:
1923
ancestral_dosage = ancestral_frequency * ploidy
2024
M = call_dosage - ancestral_dosage[:, None]
2125
if skipna:
@@ -32,18 +36,69 @@ def _grm_VanRaden(
3236
return G
3337

3438

39+
def _shrinkage_Ledoit_Wolf(X: ArrayLike) -> Tuple[ArrayLike, float]:
40+
# shrinkage estimator of Ledoit & Wolf 2004
41+
# following the notation of Endelman & Jannink
42+
# X is an m * n matrix with mean centered columns
43+
m, n = X.shape
44+
S = (X.T @ X) / m
45+
T = da.diag(S).mean() * da.eye(n)
46+
# calculate scaling parameter delta
47+
X2 = X**2
48+
Gamma = X2.T @ X2 / m # E&J eq 25
49+
delta_numerator = (Gamma - S**2).sum() / m # Error in E&J eq 24?
50+
delta_denominator = ((S - T) ** 2).sum() # squared Frobenius norm
51+
delta = delta_numerator / delta_denominator # E&J eq 20
52+
# delta must be in [0 1]
53+
delta = da.maximum(delta, 0.0)
54+
delta = da.minimum(delta, 1.0)
55+
Cov = delta * T + (1 - delta) * S # E&J eq 16
56+
return Cov, delta
57+
58+
59+
def _grm_Endelman_Jannink(
60+
call_dosage: ArrayLike,
61+
ancestral_frequency: ArrayLike,
62+
ploidy: int,
63+
skipna: bool = False,
64+
) -> ArrayLike:
65+
if skipna:
66+
raise NotImplementedError(
67+
f"The 'skipna' option is not implemented for the '{EST_ENDELMAN_JANNINK}' estimator"
68+
)
69+
ancestral_dosage = ancestral_frequency * ploidy
70+
W = call_dosage - ancestral_dosage[:, None]
71+
W_mean = da.nanmean(W, axis=0, keepdims=True)
72+
X = W - W_mean # mean centered
73+
Cov, _ = _shrinkage_Ledoit_Wolf(X)
74+
# E&J eq 17
75+
numerator = Cov + W_mean.T @ W_mean
76+
denominator = ploidy * da.mean(ancestral_frequency * (1 - ancestral_frequency))
77+
G = numerator / denominator
78+
return G
79+
80+
3581
def genomic_relationship(
3682
ds: Dataset,
3783
*,
3884
call_dosage: Hashable = variables.call_dosage,
39-
estimator: Optional[Literal["VanRaden"]] = None,
85+
estimator: Optional[Literal[EST_VAN_RADEN, EST_ENDELMAN_JANNINK]] = None, # type: ignore
4086
ancestral_frequency: Optional[Hashable] = None,
4187
ploidy: Optional[int] = None,
4288
skipna: bool = False,
4389
merge: bool = True,
4490
) -> Dataset:
4591
"""Compute a genomic relationship matrix (AKA the GRM or G-matrix).
4692
93+
94+
The following estimators are supported:
95+
96+
* **VanRaden**: the first estimator described by VanRaden 2008 [1] and
97+
generalized to autopolyploids by Ashraf et al 2016 [2] and Bilton 2020 [3].
98+
* **Endelman-Jannink**: a shrinkage estimator described by Endelman and
99+
Jannick 2012 [4]. This is based on the VanRaden estimator and aims to
100+
improve the accuracy of estimated breeding values with low-density markers.
101+
47102
Parameters
48103
----------
49104
ds
@@ -52,10 +107,8 @@ def genomic_relationship(
52107
Input variable name holding call_dosage as defined by
53108
:data:`sgkit.variables.call_dosage_spec`.
54109
estimator
55-
Specifies a relatedness estimator to use. Currently the only option
56-
is ``"VanRaden"`` which uses the method described by VanRaden 2008 [1]
57-
and generalized to autopolyploids by Ashraf et al 2016 [2] and
58-
Bilton 2020 [3].
110+
Specifies the relatedness estimator to use. Must be one of
111+
``'VanRaden'`` (the default) or ``'Endelman-Jannink'``.
59112
ancestral_frequency
60113
Frequency of variant alleles corresponding to call_dosage within
61114
the ancestral/base/reference population.
@@ -83,10 +136,6 @@ def genomic_relationship(
83136
which is a matrix of pairwise relationships among all samples.
84137
The dimensions are named ``samples_0`` and ``samples_1``.
85138
86-
Warnings
87-
--------
88-
This function is only applicable to fixed-ploidy, biallelic datasets.
89-
90139
Raises
91140
------
92141
ValueError
@@ -98,6 +147,17 @@ def genomic_relationship(
98147
ValueError
99148
If ancestral_frequency is the incorrect shape.
100149
150+
Note
151+
----
152+
The shrinkage parameter for the Endelman-Jannink estimator depends upon
153+
the total number of variants in the dataset. Monomorphic variants need
154+
to be removed from the dataset in order for the resulting estimates to
155+
match those found using the default settings in rrBLUP [5].
156+
157+
Warnings
158+
--------
159+
This function is only applicable to fixed-ploidy, biallelic datasets.
160+
101161
Examples
102162
--------
103163
@@ -217,31 +277,45 @@ def genomic_relationship(
217277
"Developing statistical methods for genetic analysis of genotypes from
218278
genotyping-by-sequencing data"
219279
PhD thesis, University of Otago.
280+
281+
[4] - J. B. Endelman and J. -L. Jannink 2012.
282+
"Shrinkage Estimation of the Realized Relationship Matrix"
283+
G3 2: 1405-1413.
284+
285+
[5] - J. B. Endelman
286+
"Ridge regression and other kernels for genomic selection with R package"
287+
Plant Genome 4: 250-255.
220288
"""
221289
variables.validate(
222290
ds,
223291
{call_dosage: variables.call_dosage_spec},
224292
)
225293

226-
estimator = estimator or "VanRaden"
227-
if estimator not in {"VanRaden"}:
228-
raise ValueError("Unknown estimator '{}'".format(estimator))
294+
estimator = estimator or EST_VAN_RADEN
229295
# TODO: raise on mixed ploidy
230296
ploidy = ploidy or ds.dims.get("ploidy")
231297
if ploidy is None:
232298
raise ValueError("Ploidy must be specified when the ploidy dimension is absent")
233-
234-
# VanRaden GRM
235-
cd = da.array(ds[call_dosage].data)
236-
n_variants, _ = cd.shape
237-
if ancestral_frequency is None:
238-
raise ValueError("The 'VanRaden' estimator requires ancestral_frequency")
239-
af = da.array(ds[ancestral_frequency].data)
240-
if af.shape != (n_variants,):
241-
raise ValueError(
242-
"The ancestral_frequency variable must have one value per variant"
243-
)
244-
G = _grm_VanRaden(cd, af, ploidy=ploidy, skipna=skipna)
299+
dosage = da.array(ds[call_dosage].data)
300+
301+
# estimators requiring 'ancestral_frequency'
302+
if estimator in {EST_VAN_RADEN, EST_ENDELMAN_JANNINK}:
303+
n_variants, _ = dosage.shape
304+
if ancestral_frequency is None:
305+
raise ValueError(
306+
f"The '{estimator}' estimator requires the 'ancestral_frequency' argument"
307+
)
308+
af = da.array(ds[ancestral_frequency].data)
309+
if af.shape != (n_variants,):
310+
raise ValueError(
311+
"The ancestral_frequency variable must have one value per variant"
312+
)
313+
if estimator == EST_VAN_RADEN:
314+
G = _grm_VanRaden(dosage, af, ploidy=ploidy, skipna=skipna)
315+
elif estimator == EST_ENDELMAN_JANNINK:
316+
G = _grm_Endelman_Jannink(dosage, af, ploidy=ploidy, skipna=skipna)
317+
else:
318+
raise ValueError(f"Unknown estimator '{estimator}'")
245319

246320
new_ds = create_dataset(
247321
{
@@ -451,7 +525,7 @@ def hybrid_relationship(
451525
pedigree_relationship: Hashable = None,
452526
pedigree_subset_inverse_relationship: Hashable = None,
453527
genomic_sample: Optional[Hashable] = None,
454-
estimator: Optional[Literal["Martini"]] = None,
528+
estimator: Optional[Literal[EST_MARTINI]] = None, # type: ignore
455529
tau: float = 1.0,
456530
omega: float = 1.0,
457531
merge: bool = True,
@@ -585,9 +659,9 @@ def hybrid_relationship(
585659
Journal of Dairy Science 93 (2): 743-752.
586660
"""
587661
if estimator is None:
588-
estimator = "Martini"
589-
if estimator not in {"Martini"}:
590-
raise ValueError("Unknown estimator '{}'".format(estimator))
662+
estimator = EST_MARTINI
663+
if estimator not in {EST_MARTINI}:
664+
raise ValueError(f"Unknown estimator '{estimator}'")
591665
variables.validate(
592666
ds,
593667
{
@@ -697,7 +771,7 @@ def hybrid_inverse_relationship(
697771
pedigree_inverse_relationship: Hashable,
698772
pedigree_subset_inverse_relationship: Hashable = None,
699773
genomic_sample: Optional[Hashable] = None,
700-
estimator: Optional[Literal["Martini"]] = None,
774+
estimator: Optional[Literal[EST_MARTINI]] = None, # type: ignore
701775
tau: float = 1.0,
702776
omega: float = 1.0,
703777
merge: bool = True,
@@ -847,9 +921,9 @@ def hybrid_inverse_relationship(
847921
Journal of Dairy Science 93 (2): 743-752.
848922
"""
849923
if estimator is None:
850-
estimator = "Martini"
851-
if estimator not in {"Martini"}:
852-
raise ValueError("Unknown estimator '{}'".format(estimator))
924+
estimator = EST_MARTINI
925+
if estimator not in {EST_MARTINI}:
926+
raise ValueError(f"Unknown estimator '{estimator}'")
853927
variables.validate(
854928
ds,
855929
{

0 commit comments

Comments
 (0)