1
- from typing import Hashable , Optional
1
+ from typing import Hashable , Optional , Tuple
2
2
3
3
import dask .array as da
4
4
import numpy as np
9
9
from sgkit .typing import ArrayLike
10
10
from sgkit .utils import conditional_merge_datasets , create_dataset
11
11
12
+ EST_VAN_RADEN = "VanRaden"
13
+ EST_ENDELMAN_JANNINK = "Endelman-Jannink"
14
+ EST_MARTINI = "Martini"
15
+
12
16
13
17
def _grm_VanRaden (
14
18
call_dosage : ArrayLike ,
15
19
ancestral_frequency : ArrayLike ,
16
20
ploidy : int ,
17
21
skipna : bool = False ,
18
- ):
22
+ ) -> ArrayLike :
19
23
ancestral_dosage = ancestral_frequency * ploidy
20
24
M = call_dosage - ancestral_dosage [:, None ]
21
25
if skipna :
@@ -32,18 +36,69 @@ def _grm_VanRaden(
32
36
return G
33
37
34
38
39
+ def _shrinkage_Ledoit_Wolf (X : ArrayLike ) -> Tuple [ArrayLike , float ]:
40
+ # shrinkage estimator of Ledoit & Wolf 2004
41
+ # following the notation of Endelman & Jannink
42
+ # X is an m * n matrix with mean centered columns
43
+ m , n = X .shape
44
+ S = (X .T @ X ) / m
45
+ T = da .diag (S ).mean () * da .eye (n )
46
+ # calculate scaling parameter delta
47
+ X2 = X ** 2
48
+ Gamma = X2 .T @ X2 / m # E&J eq 25
49
+ delta_numerator = (Gamma - S ** 2 ).sum () / m # Error in E&J eq 24?
50
+ delta_denominator = ((S - T ) ** 2 ).sum () # squared Frobenius norm
51
+ delta = delta_numerator / delta_denominator # E&J eq 20
52
+ # delta must be in [0 1]
53
+ delta = da .maximum (delta , 0.0 )
54
+ delta = da .minimum (delta , 1.0 )
55
+ Cov = delta * T + (1 - delta ) * S # E&J eq 16
56
+ return Cov , delta
57
+
58
+
59
+ def _grm_Endelman_Jannink (
60
+ call_dosage : ArrayLike ,
61
+ ancestral_frequency : ArrayLike ,
62
+ ploidy : int ,
63
+ skipna : bool = False ,
64
+ ) -> ArrayLike :
65
+ if skipna :
66
+ raise NotImplementedError (
67
+ f"The 'skipna' option is not implemented for the '{ EST_ENDELMAN_JANNINK } ' estimator"
68
+ )
69
+ ancestral_dosage = ancestral_frequency * ploidy
70
+ W = call_dosage - ancestral_dosage [:, None ]
71
+ W_mean = da .nanmean (W , axis = 0 , keepdims = True )
72
+ X = W - W_mean # mean centered
73
+ Cov , _ = _shrinkage_Ledoit_Wolf (X )
74
+ # E&J eq 17
75
+ numerator = Cov + W_mean .T @ W_mean
76
+ denominator = ploidy * da .mean (ancestral_frequency * (1 - ancestral_frequency ))
77
+ G = numerator / denominator
78
+ return G
79
+
80
+
35
81
def genomic_relationship (
36
82
ds : Dataset ,
37
83
* ,
38
84
call_dosage : Hashable = variables .call_dosage ,
39
- estimator : Optional [Literal ["VanRaden" ]] = None ,
85
+ estimator : Optional [Literal [EST_VAN_RADEN , EST_ENDELMAN_JANNINK ]] = None , # type: ignore
40
86
ancestral_frequency : Optional [Hashable ] = None ,
41
87
ploidy : Optional [int ] = None ,
42
88
skipna : bool = False ,
43
89
merge : bool = True ,
44
90
) -> Dataset :
45
91
"""Compute a genomic relationship matrix (AKA the GRM or G-matrix).
46
92
93
+
94
+ The following estimators are supported:
95
+
96
+ * **VanRaden**: the first estimator described by VanRaden 2008 [1] and
97
+ generalized to autopolyploids by Ashraf et al 2016 [2] and Bilton 2020 [3].
98
+ * **Endelman-Jannink**: a shrinkage estimator described by Endelman and
99
+ Jannick 2012 [4]. This is based on the VanRaden estimator and aims to
100
+ improve the accuracy of estimated breeding values with low-density markers.
101
+
47
102
Parameters
48
103
----------
49
104
ds
@@ -52,10 +107,8 @@ def genomic_relationship(
52
107
Input variable name holding call_dosage as defined by
53
108
:data:`sgkit.variables.call_dosage_spec`.
54
109
estimator
55
- Specifies a relatedness estimator to use. Currently the only option
56
- is ``"VanRaden"`` which uses the method described by VanRaden 2008 [1]
57
- and generalized to autopolyploids by Ashraf et al 2016 [2] and
58
- Bilton 2020 [3].
110
+ Specifies the relatedness estimator to use. Must be one of
111
+ ``'VanRaden'`` (the default) or ``'Endelman-Jannink'``.
59
112
ancestral_frequency
60
113
Frequency of variant alleles corresponding to call_dosage within
61
114
the ancestral/base/reference population.
@@ -83,10 +136,6 @@ def genomic_relationship(
83
136
which is a matrix of pairwise relationships among all samples.
84
137
The dimensions are named ``samples_0`` and ``samples_1``.
85
138
86
- Warnings
87
- --------
88
- This function is only applicable to fixed-ploidy, biallelic datasets.
89
-
90
139
Raises
91
140
------
92
141
ValueError
@@ -98,6 +147,17 @@ def genomic_relationship(
98
147
ValueError
99
148
If ancestral_frequency is the incorrect shape.
100
149
150
+ Note
151
+ ----
152
+ The shrinkage parameter for the Endelman-Jannink estimator depends upon
153
+ the total number of variants in the dataset. Monomorphic variants need
154
+ to be removed from the dataset in order for the resulting estimates to
155
+ match those found using the default settings in rrBLUP [5].
156
+
157
+ Warnings
158
+ --------
159
+ This function is only applicable to fixed-ploidy, biallelic datasets.
160
+
101
161
Examples
102
162
--------
103
163
@@ -217,31 +277,45 @@ def genomic_relationship(
217
277
"Developing statistical methods for genetic analysis of genotypes from
218
278
genotyping-by-sequencing data"
219
279
PhD thesis, University of Otago.
280
+
281
+ [4] - J. B. Endelman and J. -L. Jannink 2012.
282
+ "Shrinkage Estimation of the Realized Relationship Matrix"
283
+ G3 2: 1405-1413.
284
+
285
+ [5] - J. B. Endelman
286
+ "Ridge regression and other kernels for genomic selection with R package"
287
+ Plant Genome 4: 250-255.
220
288
"""
221
289
variables .validate (
222
290
ds ,
223
291
{call_dosage : variables .call_dosage_spec },
224
292
)
225
293
226
- estimator = estimator or "VanRaden"
227
- if estimator not in {"VanRaden" }:
228
- raise ValueError ("Unknown estimator '{}'" .format (estimator ))
294
+ estimator = estimator or EST_VAN_RADEN
229
295
# TODO: raise on mixed ploidy
230
296
ploidy = ploidy or ds .dims .get ("ploidy" )
231
297
if ploidy is None :
232
298
raise ValueError ("Ploidy must be specified when the ploidy dimension is absent" )
233
-
234
- # VanRaden GRM
235
- cd = da .array (ds [call_dosage ].data )
236
- n_variants , _ = cd .shape
237
- if ancestral_frequency is None :
238
- raise ValueError ("The 'VanRaden' estimator requires ancestral_frequency" )
239
- af = da .array (ds [ancestral_frequency ].data )
240
- if af .shape != (n_variants ,):
241
- raise ValueError (
242
- "The ancestral_frequency variable must have one value per variant"
243
- )
244
- G = _grm_VanRaden (cd , af , ploidy = ploidy , skipna = skipna )
299
+ dosage = da .array (ds [call_dosage ].data )
300
+
301
+ # estimators requiring 'ancestral_frequency'
302
+ if estimator in {EST_VAN_RADEN , EST_ENDELMAN_JANNINK }:
303
+ n_variants , _ = dosage .shape
304
+ if ancestral_frequency is None :
305
+ raise ValueError (
306
+ f"The '{ estimator } ' estimator requires the 'ancestral_frequency' argument"
307
+ )
308
+ af = da .array (ds [ancestral_frequency ].data )
309
+ if af .shape != (n_variants ,):
310
+ raise ValueError (
311
+ "The ancestral_frequency variable must have one value per variant"
312
+ )
313
+ if estimator == EST_VAN_RADEN :
314
+ G = _grm_VanRaden (dosage , af , ploidy = ploidy , skipna = skipna )
315
+ elif estimator == EST_ENDELMAN_JANNINK :
316
+ G = _grm_Endelman_Jannink (dosage , af , ploidy = ploidy , skipna = skipna )
317
+ else :
318
+ raise ValueError (f"Unknown estimator '{ estimator } '" )
245
319
246
320
new_ds = create_dataset (
247
321
{
@@ -451,7 +525,7 @@ def hybrid_relationship(
451
525
pedigree_relationship : Hashable = None ,
452
526
pedigree_subset_inverse_relationship : Hashable = None ,
453
527
genomic_sample : Optional [Hashable ] = None ,
454
- estimator : Optional [Literal ["Martini" ]] = None ,
528
+ estimator : Optional [Literal [EST_MARTINI ]] = None , # type: ignore
455
529
tau : float = 1.0 ,
456
530
omega : float = 1.0 ,
457
531
merge : bool = True ,
@@ -585,9 +659,9 @@ def hybrid_relationship(
585
659
Journal of Dairy Science 93 (2): 743-752.
586
660
"""
587
661
if estimator is None :
588
- estimator = "Martini"
589
- if estimator not in {"Martini" }:
590
- raise ValueError ("Unknown estimator '{}'" . format ( estimator ) )
662
+ estimator = EST_MARTINI
663
+ if estimator not in {EST_MARTINI }:
664
+ raise ValueError (f "Unknown estimator '{ estimator } '" )
591
665
variables .validate (
592
666
ds ,
593
667
{
@@ -697,7 +771,7 @@ def hybrid_inverse_relationship(
697
771
pedigree_inverse_relationship : Hashable ,
698
772
pedigree_subset_inverse_relationship : Hashable = None ,
699
773
genomic_sample : Optional [Hashable ] = None ,
700
- estimator : Optional [Literal ["Martini" ]] = None ,
774
+ estimator : Optional [Literal [EST_MARTINI ]] = None , # type: ignore
701
775
tau : float = 1.0 ,
702
776
omega : float = 1.0 ,
703
777
merge : bool = True ,
@@ -847,9 +921,9 @@ def hybrid_inverse_relationship(
847
921
Journal of Dairy Science 93 (2): 743-752.
848
922
"""
849
923
if estimator is None :
850
- estimator = "Martini"
851
- if estimator not in {"Martini" }:
852
- raise ValueError ("Unknown estimator '{}'" . format ( estimator ) )
924
+ estimator = EST_MARTINI
925
+ if estimator not in {EST_MARTINI }:
926
+ raise ValueError (f "Unknown estimator '{ estimator } '" )
853
927
variables .validate (
854
928
ds ,
855
929
{
0 commit comments