Skip to content

Commit 7eeaafb

Browse files
author
Scott Sanderson
authored
Merge pull request #2071 from quantopian/speedup-pearson
PERF: Speedup RollingPearson
2 parents c825927 + cb61ba9 commit 7eeaafb

File tree

2 files changed

+167
-10
lines changed

2 files changed

+167
-10
lines changed

tests/pipeline/test_statistical.py

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,10 @@
3434
RollingSpearmanOfReturns,
3535
SimpleBeta,
3636
)
37-
from zipline.pipeline.factors.statistical import vectorized_beta
37+
from zipline.pipeline.factors.statistical import (
38+
vectorized_beta,
39+
vectorized_pearson_r,
40+
)
3841
from zipline.pipeline.loaders.frame import DataFrameLoader
3942
from zipline.pipeline.sentinels import NotSpecified
4043
from zipline.testing import (
@@ -1059,3 +1062,84 @@ def test_allowed_missing_doesnt_double_count(self):
10591062
result5 = vectorized_beta(dependents, independent, allowed_missing=5)
10601063
assert_equal(np.isnan(result5),
10611064
np.array([False, False, False, False, False]))
1065+
1066+
1067+
class VectorizedCorrelationTestCase(ZiplineTestCase):

    def naive_columnwise_func(self, func, left, right):
        """Apply a scipy correlation ``func`` column by column, dropping rows
        where either input is NaN, and collect the r statistic per column.
        """
        self.assertEqual(left.shape, right.shape)
        out = np.empty_like(left[0])

        for j in range(left.shape[1]):
            lcol = left[:, j]
            rcol = right[:, j]
            keep = ~(np.isnan(lcol) | np.isnan(rcol))
            # func returns (r, pvalue); only the correlation is of interest.
            out[j] = func(lcol[keep], rcol[keep])[0]

        return out

    def naive_columnwise_pearson(self, left, right):
        return self.naive_columnwise_func(pearsonr, left, right)

    def naive_columnwise_spearman(self, left, right):
        return self.naive_columnwise_func(spearmanr, left, right)

    @parameter_space(
        seed=[1, 2, 42],
        nan_offset=[-1, 0, 1],
        nans=['dependent', 'independent', 'both'],
        __fail_fast=True,
    )
    def test_produce_nans_when_too_much_missing_data(self,
                                                     seed,
                                                     nans,
                                                     nan_offset):
        rng = np.random.RandomState(seed)

        slopes = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independents = as_column(np.linspace(-5., 5., 30)) + np.arange(5)
        noise = as_column(rng.uniform(-2, 2, 30))
        dependents = 1.0 + slopes * independents + noise

        # Write nans in a triangular pattern into the middle of the input
        # arrays: column i receives i + 1 nans.
        nan_grid = np.triu(np.ones((5, 5), dtype=bool))
        rows = slice(10 + nan_offset, 15 + nan_offset)

        if nans in ('dependent', 'both'):
            dependents[rows][nan_grid] = np.nan
        if nans in ('independent', 'both'):
            independents[rows][nan_grid] = np.nan

        expected = self.naive_columnwise_pearson(dependents, independents)
        for allowed_missing in list(range(7)) + [10000]:
            actual = vectorized_pearson_r(
                dependents, independents, allowed_missing
            )
            for col, value in enumerate(actual):
                # Column ``col`` carries col + 1 missing values.
                if col + 1 > allowed_missing:
                    self.assertTrue(np.isnan(value))
                else:
                    assert_equal(value, expected[col])

    def test_broadcasting(self):
        base = as_column(np.array([1, 2, 3, 4, 5]))
        dependent = base * [2.5, 1.0, -3.5]

        def check(independent):
            assert_equal(
                vectorized_pearson_r(
                    dependent, independent, allowed_missing=0
                ),
                np.array([1.0, 1.0, -1.0]),
            )

        # An N x 1 independent array and the same column tiled out to N x 3
        # must produce identical results.
        check(base)
        check(np.tile(base, 3))

zipline/pipeline/factors/statistical.py

Lines changed: 82 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1+
from numexpr import evaluate
12
import numpy as np
23
from numpy import broadcast_arrays
34
from scipy.stats import (
45
linregress,
5-
pearsonr,
66
spearmanr,
77
)
88

@@ -88,13 +88,12 @@ class RollingPearson(_RollingCorrelation):
8888
window_safe = True
8989

9090
def compute(self, today, assets, out, base_data, target_data):
    """Write the Pearson correlation between each column of ``base_data``
    and ``target_data`` into ``out``.
    """
    # Delegate to the vectorized kernel, which also handles broadcasting a
    # single-column ``target_data``.  allowed_missing=0 means any nan in a
    # window yields a nan correlation for that column.
    vectorized_pearson_r(base_data, target_data, 0, out=out)
9897

9998

10099
class RollingSpearman(_RollingCorrelation):
@@ -582,8 +581,11 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None):
582581
Independent variable of the regression
583582
allowed_missing : int
584583
Number of allowed missing (NaN) observations per column. Columns with
585-
more than this many non-nan observations in both ``dependents`` and
584+
more than this many non-nan observations in either ``dependents`` or
586585
``independents`` will output NaN as the regression coefficient.
586+
out : np.array[M] or None, optional
587+
Output array into which to write results. If None, a new array is
588+
created and returned.
587589
588590
Returns
589591
-------
@@ -663,3 +665,74 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None):
663665
out[nanlocs] = nan
664666

665667
return out
668+
669+
670+
def vectorized_pearson_r(dependents, independents, allowed_missing, out=None):
    """
    Compute Pearson's r between columns of ``dependents`` and ``independents``.

    Parameters
    ----------
    dependents : np.array[N, M]
        Array with columns of data to be correlated against ``independents``.
    independents : np.array[N, M] or np.array[N, 1]
        Independent variable(s) of the correlation. If a single column is
        passed, it is broadcast to the shape of ``dependents``.
    allowed_missing : int
        Number of allowed missing (NaN) observations per column. Columns with
        more than this many NaN observations in either ``dependents`` or
        ``independents`` will output NaN as the correlation coefficient.
    out : np.array[M] or None, optional
        Output array into which to write results.  If None, a new array is
        created and returned.

    Returns
    -------
    correlations : np.array[M]
        Pearson correlation coefficients for each column of ``dependents``.

    See Also
    --------
    :class:`zipline.pipeline.factors.RollingPearson`
    :class:`zipline.pipeline.factors.RollingPearsonOfReturns`
    """
    nan = np.nan
    isnan = np.isnan
    N, M = dependents.shape

    if out is None:
        out = np.full(M, nan)

    if allowed_missing > 0:
        # If we're handling nans robustly, we need to mask both arrays to
        # locations where either was nan, so that per-column means are taken
        # over the same set of observations for both inputs.
        either_nan = isnan(dependents) | isnan(independents)
        independents = np.where(either_nan, nan, independents)
        dependents = np.where(either_nan, nan, dependents)
        mean = np.nanmean
    else:
        # Otherwise, we can just use mean, which will give us a nan for any
        # column where there's ever a nan.
        mean = np.mean

    # Pearson R is Cov(X, Y) / (StdDev(X) * StdDev(Y)).
    # c.f. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    ind_residual = independents - mean(independents, axis=0)
    dep_residual = dependents - mean(dependents, axis=0)

    ind_variance = mean(ind_residual ** 2, axis=0)
    dep_variance = mean(dep_residual ** 2, axis=0)

    covariances = mean(ind_residual * dep_residual, axis=0)

    # Counting nans in ``independents`` alone is sufficient for the mask:
    # when allowed_missing > 0, ``independents`` was overwritten above with
    # the union of both inputs' nan locations; when allowed_missing == 0,
    # any nan in ``dependents`` has already propagated to nan through
    # np.mean and therefore through the quotient below.
    evaluate(
        'where(mask, nan, cov / sqrt(ind_variance * dep_variance))',
        local_dict={'cov': covariances,
                    'mask': isnan(independents).sum(axis=0) > allowed_missing,
                    'nan': np.nan,
                    'ind_variance': ind_variance,
                    'dep_variance': dep_variance},
        global_dict={},
        out=out,
    )
    return out

0 commit comments

Comments
 (0)