Skip to content

Commit 5d7c3bc

Browse files
committed
add Blom K stat
1 parent cd30f03 commit 5d7c3bc

File tree

5 files changed

+404
-9
lines changed

5 files changed

+404
-9
lines changed

toytree/pcm/src/api.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python
22

3-
"""
3+
"""DEPRECATED
4+
45
Phylogenetic comparative methods functions made accessible at
56
:mod:`toytree.pcm` as a subpackage-level API.
67
"""

toytree/pcm/src/traits/__init__.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
55
"""
66

7-
from toytree.pcm.src.traits.discrete_markov_model_sim import (
8-
get_markov_model,
9-
simulate_discrete_data,
10-
)
11-
12-
from toytree.pcm.src.traits.continuous_brownian_sim import (
13-
simulate_continuous_brownian,
14-
)
7+
# from toytree.pcm.src.traits.discrete_markov_model_sim import (
8+
# get_markov_model,
9+
# simulate_discrete_data,
10+
# )
11+
12+
# from toytree.pcm.src.traits.continuous_brownian_sim import (
13+
# simulate_continuous_brownian,
14+
# )

toytree/pcm/src/traits/continuous_brownian_sim.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
from toytree.utils import ToytreeError
1212
from toytree.core.apis import add_subpackage_method, PhyloCompAPI
1313

14+
__all__ = [
15+
"simulate_continuous_brownian",
16+
]
17+
1418

1519
def simulate_continuous_multivariate_brownian(
1620
tree: ToyTree,
Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
#!/usr/bin/env python
2+
3+
"""Metrics to calculate phylogenetic signal in trait values.
4+
5+
Blomberg's K is used to quantify phylogenetic signal relative in trait
6+
evolution relative to a Brownian motion model. Values of K>1 indicate
7+
samples are less similar than expected, whereas K<1 indicates that they
8+
are more similar than expected. Permutations can be used to perform
9+
a significance test.
10+
11+
Example
12+
-------
13+
>>> tree = toytree.rtree.unittree(ntips=24, seed=123)
14+
>>> trait = tree.pcm.simulate_continuous_brownian([1.0], tips_only=True, seed=123)
15+
>>> kstat = phylogenetic_signal_k(tree=tree, data=trait, test=True)
16+
>>> # {'K': 0.9857885, 'P-value': 0.002, 'permutations': 1000}
17+
18+
References
19+
----------
20+
The original description of the K statistic:
21+
__Blomberg, S. P., T. Garland Jr., and A. R. Ives (2003) Testing for
22+
phylogenetic signal in comparative data: Behavioral traits are more
23+
labile. Evolution, *57*, 717-745.__
24+
25+
Extension to conduct hypothesis tests and incorporate sampling error:
26+
__Ives, A. R., P. E. Midford, and T. Garland Jr. (2007) Within-species
27+
variation and measurement error in phylogenetic comparative biology.
28+
Systematic Biology, *56*, 252-270.__
29+
30+
Extension to multivariate measure of K:
31+
__Philipp Mitteroecker, Michael L Collyer, Dean C Adams, Exploring Phylogenetic
32+
Signal in Multivariate Phenotypes by Maximizing Blomberg’s K, Systematic
33+
Biology, 2024;, syae035, https://doi.org/10.1093/sysbio/syae035__
34+
35+
"""
36+
37+
from typing import Union, Sequence, TypeAlias
38+
import numpy as np
39+
import pandas as pd
40+
from toytree.core import ToyTree
41+
from toytree.pcm import get_vcv_matrix_from_tree
42+
from scipy.optimize import minimize_scalar
43+
from loguru import logger
44+
45+
46+
logger = logger.bind(name="toytree")
47+
feature: TypeAlias = Union[str, Sequence[float], pd.Series, pd.DataFrame]
48+
__all__ = ["phylogenetic_signal_k"]
49+
50+
51+
def _validate_features(x: feature, max_dim: int, size: int) -> np.ndarray:
52+
"""Validate data has correct dimensions and size."""
53+
# if DataFrame w/ only 1 column convert to Series
54+
if isinstance(x, pd.DataFrame):
55+
if x.shape[1] == 1:
56+
x = x.iloc[:, 0]
57+
# force to array
58+
x = np.asarray(x)
59+
# check dimensions and size
60+
assert x.ndim <= max_dim, f"feature ndim ({x.ndim}) exceeds max allowed ndim ({max_dim})."
61+
assert x.shape[0] == size, "feature cannot exceed ntips"
62+
return x
63+
64+
65+
def phylogenetic_signal_k(
66+
tree: ToyTree,
67+
data: Union[str, Sequence[float]],
68+
error: Union[str, Sequence[float]] = None,
69+
test: bool = False,
70+
permutations: int = 1000,
71+
) -> dict[str, float]:
72+
"""Return Blomberg's K measurement of phylogenetic signal.
73+
74+
The K statistic is standardized by the expectation under the
75+
Brownian motion model of evolution so that K<1 indicates that
76+
relatives resemble each other less than expected under the model
77+
(perhaps due to adaptive evolution); whereas K>1 indicates that
78+
close relatives are more similar than expected under the model.
79+
80+
Parameters
81+
----------
82+
tree: ToyTree
83+
A tree with edge lengths.
84+
data: str | Sequence[float]
85+
Continuous trait values.
86+
error: str | Sequence[float]
87+
Optional standard errors measured on trait values.
88+
test: bool
89+
Perform permutation test for significance.
90+
permutations: int
91+
Number of permutations to perform if testing significance.
92+
93+
Returns
94+
-------
95+
dict[str, float]
96+
A dict with keys: ["K", "P-value", "permutations"]. It can also
97+
include additional items if standard errors are included which
98+
will estimate the variance parameter "sig2".
99+
100+
Example
101+
-------
102+
>>> tree = toytree.rtree.unittree(ntips=10, seed=123, treeheight=2.0)
103+
>>> data = tree.pcm.simulate_continuous_brownian(1.0, tips_only=True)
104+
>>> tree.pcm.phylogenetic_signal_k(tree, data)
105+
>>> # {"K": ..., "P-value": ..., ...}
106+
"""
107+
if error is None:
108+
return _phylogenetic_signal_k(tree, data, None, test, permutations)
109+
else:
110+
logger.warning("IN DEVELOPMENT")
111+
return _phylogenetic_signal_k_with_se(tree, data, error, test, permutations)
112+
113+
114+
115+
def _phylogenetic_signal_k(
116+
tree: ToyTree,
117+
data: Union[str, Sequence[float]],
118+
error: Union[str, Sequence[float]] = None,
119+
test: bool = False,
120+
permutations: int = 1000,
121+
) -> dict[str, float]:
122+
"""Return Blomberg's K measurement of phylogenetic signal.
123+
124+
See docstring in `phylogenetic_signal_k`.
125+
"""
126+
# get ntips and the variance-covariance matrix from tree
127+
ntips = tree.ntips
128+
V = get_vcv_matrix_from_tree(tree)
129+
130+
# [optional] get data as features from the tree
131+
if isinstance(data, str):
132+
data = tree.get_node_data(data)[:ntips]
133+
if isinstance(error, str):
134+
error = tree.get_node_data(error)[:ntips]
135+
if error is None:
136+
error = np.repeat(np.nan, ntips)
137+
138+
# validate proper trait format returned as float array
139+
x = _validate_features(data, max_dim=1, size=tree.ntips)
140+
141+
# calculate K statistic
142+
kstat = _calculate_k(x, V)
143+
144+
# [optional] permutation test
145+
if test:
146+
pval = _permutation_test_k(permutations, x, V, kstat)
147+
return {"K": kstat, "P-value": pval, "permutations": permutations}
148+
return {"K": kstat, "P-value": np.nan, "permutations": 0}
149+
150+
151+
def _calculate_k(x, V, IV = None) -> float:
152+
"""Return K statistic calculated for data x and variance-covariance
153+
matrix V."""
154+
# compute PGLS mean (root state)
155+
n = x.size
156+
IV = IV if IV is not None else np.linalg.inv(V)
157+
a = np.sum(IV @ x) / np.sum(IV)
158+
159+
# calculate K statistic
160+
num = ((x - a).T @ (x - a) / ((x - a).T @ IV @ (x - a)))
161+
dnm = ((np.sum(V.diagonal()) - n / np.sum(IV)) / (n - 1))
162+
return num / dnm
163+
164+
165+
def _permutation_test_k(size: int, x: np.ndarray, V: np.ndarray, k: float):
166+
"""Return p-value from permutations as a test statistic.
167+
"""
168+
kstats = np.zeros(size)
169+
rng = np.random.default_rng()
170+
for i in range(size):
171+
_x = rng.choice(x, size=x.size, replace=False)
172+
kstats[i] = _calculate_k(_x, V)
173+
174+
# the proportion of permutations w/ k_ > k
175+
return np.sum(kstats >= k) / kstats.size
176+
177+
178+
def _calculate_k_with_se(x: np.ndarray, V: np.ndarray, error: np.ndarray) -> tuple[float, float]:
179+
"""
180+
"""
181+
# start using no error vcv
182+
IV = np.linalg.inv(V)
183+
a = np.sum(IV @ x) / np.sum(IV)
184+
n = x.size
185+
E = np.diag(error ** 2)
186+
187+
# constrain optimization by setting a max on sigma2
188+
term = x - a
189+
max_sig2 = (term.T @ IV @ term) / n
190+
191+
# maximum likelihood model fitting
192+
estimate = minimize_scalar(
193+
_likelihood,
194+
args=(V, E, x),
195+
bounds=(1e-6, max_sig2),
196+
method="bounded",
197+
)
198+
model_fit = {
199+
"optimum": estimate.x,
200+
"LogLik": -estimate.fun,
201+
"convergence": estimate.success,
202+
}
203+
# logger.debug(f"\n{pd.Series(model_fit)}")
204+
205+
# get rate parameter
206+
sig2 = model_fit["optimum"] * (n / (n - 1))
207+
208+
# get VCV w/ rate scalar
209+
Ve = sig2 * V + E
210+
211+
# calculate K using optimized Ve
212+
IVe = np.linalg.inv(Ve)
213+
a = np.sum(IVe @ x) / np.sum(IVe)
214+
num = ((x - a).T @ (x - a) / ((x - a).T @ IVe @ (x - a)))
215+
dnm = ((np.sum(Ve.diagonal()) - n / np.sum(IVe)) / (n - 1))
216+
return num / dnm, sig2, model_fit["LogLik"]
217+
218+
219+
def _permutation_test_k_with_se(size: int, x: np.ndarray, V: np.ndarray, error: np.ndarray, kstat: float):
220+
"""Return p-value from permutations as a test statistic.
221+
"""
222+
kstats = np.zeros(size)
223+
rng = np.random.default_rng()
224+
for i in range(size):
225+
order = rng.choice(range(x.size), size=x.size, replace=False)
226+
_x = x[order]
227+
_e = error[order]
228+
_E = np.diag(_e ** 2)
229+
kstats[i], _, _ = _calculate_k_with_se(_x, V, _E)
230+
231+
# the proportion of permutations w/ k_ > k
232+
return np.sum(kstats >= kstat) / kstats.size
233+
234+
235+
def _phylogenetic_signal_k_with_se(
236+
tree: ToyTree,
237+
data: Union[str, Sequence[float]],
238+
error: Union[str, Sequence[float]] = None,
239+
test: bool = False,
240+
permutations: int = 1000,
241+
) -> dict[str, float]:
242+
"""Calculate phylogenetic signal (K) with measurement error.
243+
244+
This involves fitting a ML model to estimate the rate ...
245+
"""
246+
# get ntips and the variance-covariance matrix from tree
247+
ntips = tree.ntips
248+
V = get_vcv_matrix_from_tree(tree)
249+
250+
# [optional] get data as features from the tree
251+
if isinstance(data, str):
252+
data = tree.get_node_data(data)[:ntips]
253+
if isinstance(error, str):
254+
error = tree.get_node_data(error)[:ntips]
255+
if error is None:
256+
error = np.repeat(0.0, ntips)
257+
258+
# validate proper trait format returned as float array
259+
x = _validate_features(data, max_dim=1, size=ntips)
260+
261+
# calculate K stat w/ error
262+
kstat, sig2, loglik = _calculate_k_with_se(x, V, error)
263+
264+
# [optional] permutation test statistic
265+
if permutations:
266+
pval = _permutation_test_k_with_se(permutations, x, V, error, kstat)
267+
268+
# return as a dict
269+
return {
270+
"K": kstat,
271+
"P-value": pval,
272+
"permutations": 0 if not test else permutations,
273+
"log-likelihood": -loglik,
274+
"sig2": sig2,
275+
}
276+
277+
278+
def _likelihood(theta: float, V: np.ndarray, E: np.ndarray, y: np.ndarray) -> float:
279+
"""Estimate theta by maximizing the likelihood.
280+
"""
281+
# weight variances by theta and add Error variance
282+
C = theta * V + E
283+
284+
# get pgls mean
285+
IC = np.linalg.inv(C)
286+
n = len(y)
287+
a = np.sum(IC @ y) / np.sum(IC)
288+
289+
# get log determinant of variance covariance matrix
290+
det = np.linalg.det(C)
291+
if det <= 0:
292+
logdet2 = np.log(1e-12)
293+
else:
294+
logdet2 = np.log(det) / 2.
295+
296+
# compute log likelihood
297+
term = (y - a)
298+
logL = (
299+
-term.T @ IC @ term / 2. - n * np.log(2 * np.pi) / 2. - logdet2
300+
)
301+
# print(theta, -logL)
302+
return -logL
303+
304+
305+
306+
307+
if __name__ == "__main__":
308+
309+
import toytree
310+
toytree.set_log_level("DEBUG")
311+
312+
# generate test data
313+
tree = toytree.rtree.unittree(ntips=24, seed=123)
314+
tree.pcm.simulate_continuous_brownian(
315+
rates=[1.0, 0.1], tips_only=True, seed=123, inplace=True
316+
)
317+
318+
# get K
319+
k = _phylogenetic_signal_k(tree=tree, data="t0", test=True)
320+
logger.info(k)
321+
322+
# get K w/ Error
323+
k = _phylogenetic_signal_k_with_se(tree=tree, data="t0", error="t1", test=True)
324+
logger.info(k)

0 commit comments

Comments
 (0)