Skip to content

Commit 7eeaafb

Browse files
author
Scott Sanderson
authored
Merge pull request #2071 from quantopian/speedup-pearson
PERF: Speedup RollingPearson
2 parents c825927 + cb61ba9 commit 7eeaafb

File tree

2 files changed

+167
-10
lines changed

2 files changed

+167
-10
lines changed

tests/pipeline/test_statistical.py

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,10 @@
3434
RollingSpearmanOfReturns,
3535
SimpleBeta,
3636
)
37-
from zipline.pipeline.factors.statistical import vectorized_beta
37+
from zipline.pipeline.factors.statistical import (
38+
vectorized_beta,
39+
vectorized_pearson_r,
40+
)
3841
from zipline.pipeline.loaders.frame import DataFrameLoader
3942
from zipline.pipeline.sentinels import NotSpecified
4043
from zipline.testing import (
@@ -1059,3 +1062,84 @@ def test_allowed_missing_doesnt_double_count(self):
10591062
result5 = vectorized_beta(dependents, independent, allowed_missing=5)
10601063
assert_equal(np.isnan(result5),
10611064
np.array([False, False, False, False, False]))
1065+
1066+
1067+
class VectorizedCorrelationTestCase(ZiplineTestCase):

    def naive_columnwise_func(self, func, left, right):
        """Apply a scipy correlation ``func`` column by column, dropping rows
        where either input is NaN, and collect the r statistic per column.
        """
        self.assertEqual(left.shape, right.shape)
        out = np.empty_like(left[0])

        for j in range(left.shape[1]):
            lcol = left[:, j]
            rcol = right[:, j]
            keep = ~(np.isnan(lcol) | np.isnan(rcol))
            # func returns (r, pvalue); only the correlation is of interest.
            out[j] = func(lcol[keep], rcol[keep])[0]

        return out

    def naive_columnwise_pearson(self, left, right):
        return self.naive_columnwise_func(pearsonr, left, right)

    def naive_columnwise_spearman(self, left, right):
        return self.naive_columnwise_func(spearmanr, left, right)

    @parameter_space(
        seed=[1, 2, 42],
        nan_offset=[-1, 0, 1],
        nans=['dependent', 'independent', 'both'],
        __fail_fast=True,
    )
    def test_produce_nans_when_too_much_missing_data(self,
                                                     seed,
                                                     nans,
                                                     nan_offset):
        rng = np.random.RandomState(seed)

        slopes = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independents = as_column(np.linspace(-5., 5., 30)) + np.arange(5)
        noise = as_column(rng.uniform(-2, 2, 30))
        dependents = 1.0 + slopes * independents + noise

        # Write nans in a triangular pattern into the middle of the input
        # arrays: column i receives i + 1 nans.
        nan_grid = np.triu(np.ones((5, 5), dtype=bool))
        rows = slice(10 + nan_offset, 15 + nan_offset)

        if nans in ('dependent', 'both'):
            dependents[rows][nan_grid] = np.nan
        if nans in ('independent', 'both'):
            independents[rows][nan_grid] = np.nan

        expected = self.naive_columnwise_pearson(dependents, independents)
        for allowed_missing in list(range(7)) + [10000]:
            actual = vectorized_pearson_r(
                dependents, independents, allowed_missing
            )
            for col, value in enumerate(actual):
                # Column ``col`` carries col + 1 missing values.
                if col + 1 > allowed_missing:
                    self.assertTrue(np.isnan(value))
                else:
                    assert_equal(value, expected[col])

    def test_broadcasting(self):
        base = as_column(np.array([1, 2, 3, 4, 5]))
        dependent = base * [2.5, 1.0, -3.5]

        def check(independent):
            assert_equal(
                vectorized_pearson_r(
                    dependent, independent, allowed_missing=0
                ),
                np.array([1.0, 1.0, -1.0]),
            )

        # An N x 1 independent array and the same column tiled out to N x 3
        # must produce identical results.
        check(base)
        check(np.tile(base, 3))

zipline/pipeline/factors/statistical.py

Lines changed: 82 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1+
from numexpr import evaluate
12
import numpy as np
23
from numpy import broadcast_arrays
34
from scipy.stats import (
45
linregress,
5-
pearsonr,
66
spearmanr,
77
)
88

@@ -88,13 +88,12 @@ class RollingPearson(_RollingCorrelation):
8888
window_safe = True
8989

9090
def compute(self, today, assets, out, base_data, target_data):
    """Write the Pearson correlation between each column of ``base_data``
    and ``target_data`` into ``out``.
    """
    # Delegate to the vectorized kernel, which also handles broadcasting a
    # single-column ``target_data``.  allowed_missing=0 means any nan in a
    # window yields a nan correlation for that column.
    vectorized_pearson_r(base_data, target_data, 0, out=out)
9897

9998

10099
class RollingSpearman(_RollingCorrelation):
@@ -582,8 +581,11 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None):
582581
Independent variable of the regression
583582
allowed_missing : int
584583
Number of allowed missing (NaN) observations per column. Columns with
585-
more than this many non-nan observations in both ``dependents`` and
584+
more than this many non-nan observations in either ``dependents`` or
586585
``independents`` will output NaN as the regression coefficient.
586+
out : np.array[M] or None, optional
587+
Output array into which to write results. If None, a new array is
588+
created and returned.
587589
588590
Returns
589591
-------
@@ -663,3 +665,74 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None):
663665
out[nanlocs] = nan
664666

665667
return out
668+
669+
670+
def vectorized_pearson_r(dependents, independents, allowed_missing, out=None):
    """
    Compute Pearson's r between columns of ``dependents`` and ``independents``.

    Parameters
    ----------
    dependents : np.array[N, M]
        Array with columns of data to be correlated against ``independents``.
    independents : np.array[N, M] or np.array[N, 1]
        Independent variable(s) of the correlation. If a single column is
        passed, it is broadcast to the shape of ``dependents``.
    allowed_missing : int
        Number of allowed missing (NaN) observations per column. Columns with
        more than this many NaN observations in either ``dependents`` or
        ``independents`` will output NaN as the correlation coefficient.
    out : np.array[M] or None, optional
        Output array into which to write results.  If None, a new array is
        created and returned.

    Returns
    -------
    correlations : np.array[M]
        Pearson correlation coefficients for each column of ``dependents``.

    See Also
    --------
    :class:`zipline.pipeline.factors.RollingPearson`
    :class:`zipline.pipeline.factors.RollingPearsonOfReturns`
    """
    nan = np.nan
    isnan = np.isnan
    N, M = dependents.shape

    if out is None:
        out = np.full(M, nan)

    if allowed_missing > 0:
        # If we're handling nans robustly, we need to mask both arrays to
        # locations where either was nan, so that per-column means are taken
        # over the same set of observations for both inputs.
        either_nan = isnan(dependents) | isnan(independents)
        independents = np.where(either_nan, nan, independents)
        dependents = np.where(either_nan, nan, dependents)
        mean = np.nanmean
    else:
        # Otherwise, we can just use mean, which will give us a nan for any
        # column where there's ever a nan.
        mean = np.mean

    # Pearson R is Cov(X, Y) / (StdDev(X) * StdDev(Y)).
    # c.f. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    ind_residual = independents - mean(independents, axis=0)
    dep_residual = dependents - mean(dependents, axis=0)

    ind_variance = mean(ind_residual ** 2, axis=0)
    dep_variance = mean(dep_residual ** 2, axis=0)

    covariances = mean(ind_residual * dep_residual, axis=0)

    # Counting nans in ``independents`` alone is sufficient for the mask:
    # when allowed_missing > 0, ``independents`` was overwritten above with
    # the union of both inputs' nan locations; when allowed_missing == 0,
    # any nan in ``dependents`` has already propagated to nan through
    # np.mean and therefore through the quotient below.
    evaluate(
        'where(mask, nan, cov / sqrt(ind_variance * dep_variance))',
        local_dict={'cov': covariances,
                    'mask': isnan(independents).sum(axis=0) > allowed_missing,
                    'nan': np.nan,
                    'ind_variance': ind_variance,
                    'dep_variance': dep_variance},
        global_dict={},
        out=out,
    )
    return out

0 commit comments

Comments
 (0)