From 77e5d021862abc91f36424d3ed7069bc17aa1cd3 Mon Sep 17 00:00:00 2001
From: justinehansen <justine.hansen@mail.mcgill.ca>
Date: Mon, 12 Jul 2021 10:31:49 -0700
Subject: [PATCH 1/3] [ENH] Added distance-dependent cross validation

---
 netneurotools/stats.py | 100 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/netneurotools/stats.py b/netneurotools/stats.py
index e3148fa..8993653 100644
--- a/netneurotools/stats.py
+++ b/netneurotools/stats.py
@@ -6,10 +6,12 @@
 import warnings
 
 import numpy as np
+import random
 from tqdm import tqdm
 from itertools import combinations
 from scipy import optimize, spatial, special, stats as sstats
 from scipy.stats.stats import _chk2_asarray
+from scipy.spatial.distance import squareform, pdist
 from sklearn.utils.validation import check_random_state
 from sklearn.linear_model import LinearRegression
 
@@ -925,3 +927,101 @@ def get_reg_r_sq(X, y):
     model_metrics["full_r_sq"] = model_r_sq[tuple(range(n_predictor))]
 
     return model_metrics, model_r_sq
+
+
+def cv_distance_dependent(X, y, coords, train_pct=.75, nsplits=1000,
+                          metric='rsq', use_adjusted_rsq=True):
+    '''
+    Distance-dependent cross-validation of regression equation `y ~ X`
+
+    Parameters
+    ----------
+    X : (N[, R]) array_like
+        Coefficient matrix of `R` variables for `N` brain regions
+    y : (N,) array_like
+        Dependent variable vector for `N` brain regions
+    coords : (N, 3) array_like
+        Coordinate matrix for `N` brain regions
+    train_pct : float, optional
+        Percentage of brain regions in the training set. 0 < train_pct < 1
+        Default: 75%
+    nsplits : float, optional
+        Number of train/test splits. Default: 1000
+    metric : {'rsq', 'corr'}, optional
+        Metric of model assessment. 'rsq' will return the adjusted r-squared
+        of the train and test set model performance for each split. 'corr' will
+        return the correlation between predicted and empiracle observations
+        for the train and test set. Default: 'rsq'
+    use_adjusted_rsq : bool, optional
+        Whether to use adjusted r-squared. Only relevant if metric is 'rsq'.
+        Default: True
+
+    Returns
+    -------
+    train_metric : (nsplits,) list
+        List of length `nsplits` of performance metric on the training set.
+    test_metric : (nsplits,) list
+        List of length `nsplits` of performance metric on the test set.
+    '''
+
+    # sklearn linear regression wrapper
+    def get_reg_r_sq(X, y):
+        lin_reg = LinearRegression()
+        lin_reg.fit(X, y)
+        yhat = lin_reg.predict(X)
+        SS_Residual = sum((y - yhat) ** 2)
+        SS_Total = sum((y - np.mean(y)) ** 2)
+        r_squared = 1 - (float(SS_Residual)) / SS_Total
+        adjusted_r_squared = 1 - (1 - r_squared) * \
+            (len(y) - 1) / (len(y) - X.shape[1] - 1)
+        if use_adjusted_rsq:
+            return adjusted_r_squared
+        else:
+            return r_squared
+
+    P = squareform(pdist(coords, metric="euclidean"))
+    train_metric = []
+    test_metric = []
+
+    for i in range(nsplits):
+
+        # randomly chosen source node
+        sourceNode = random.choice(range(0, len(coords), 1))
+        # distance from source node to all other nodes in network
+        distances = P[sourceNode, :]
+        idx = np.argsort(distances)
+
+        # train_pct of nodes closest to source node comprise the training set
+        # the remaining (1 - train_pct) of nodes comprise the test set
+        train_idx = idx[:int(np.floor(train_pct * len(coords)))]
+        test_idx = idx[int(np.floor(train_pct * len(coords))):]
+
+        # linear regression
+        mdl = LinearRegression()
+        mdl.fit(X[train_idx, :], y[train_idx])
+        if metric == 'rsq':
+            # get r^2 of train set
+            train_metric.append(get_reg_r_sq(X[train_idx, :], y[train_idx]))
+
+        elif metric == 'corr':
+            rho, _ = sstats.pearsonr(mdl.predict(X[train_idx, :]),
+				     y[train_idx])
+            train_metric.append(rho)
+
+        # prediction on test set
+        yhat = mdl.predict(X[test_idx, :])
+        if metric == 'rsq':
+            # get r^2 of test set
+            SS_Residual = sum((y[test_idx] - yhat) ** 2)
+            SS_Total = sum((y[test_idx] - np.mean(y[test_idx])) ** 2)
+            r_squared = 1 - (float(SS_Residual)) / SS_Total
+            adjusted_r_squared = 1-(1-r_squared)*((len(y[test_idx]) - 1) /
+                                                  (len(y[test_idx]) -
+                                                   X.shape[1]-1))
+            test_metric.append(adjusted_r_squared)
+
+        elif metric == 'corr':
+            rho, _ = sstats.pearsonr(yhat, y[test_idx])
+            test_metric.append(rho)
+
+    return train_metric, test_metric

From 6fb2e3d7490f0a1023b069e32e2303177f5ededa Mon Sep 17 00:00:00 2001
From: justinehansen <justine.hansen@mail.mcgill.ca>
Date: Mon, 12 Jul 2021 10:55:27 -0700
Subject: [PATCH 2/3] [STY] Style fix

---
 netneurotools/stats.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/netneurotools/stats.py b/netneurotools/stats.py
index 8993653..8b071db 100644
--- a/netneurotools/stats.py
+++ b/netneurotools/stats.py
@@ -1005,7 +1005,7 @@ def get_reg_r_sq(X, y):
 
         elif metric == 'corr':
             rho, _ = sstats.pearsonr(mdl.predict(X[train_idx, :]),
-				     y[train_idx])
+                                     y[train_idx])
             train_metric.append(rho)
 
         # prediction on test set
@@ -1015,9 +1015,9 @@ def get_reg_r_sq(X, y):
             SS_Residual = sum((y[test_idx] - yhat) ** 2)
             SS_Total = sum((y[test_idx] - np.mean(y[test_idx])) ** 2)
             r_squared = 1 - (float(SS_Residual)) / SS_Total
-            adjusted_r_squared = 1-(1-r_squared)*((len(y[test_idx]) - 1) /
-                                                  (len(y[test_idx]) -
-                                                   X.shape[1]-1))
+            adjusted_r_squared = 1 - (1 - r_squared) * ((len(y[test_idx]) - 1)
+                                                        / (len(y[test_idx])
+                                                           - X.shape[1] - 1))
             test_metric.append(adjusted_r_squared)
 
         elif metric == 'corr':

From f782c1bab4f65ee931a984fa928cf4ff02a644db Mon Sep 17 00:00:00 2001
From: Zhen-Qi Liu <liuzhenqi77@gmail.com>
Date: Wed, 28 Sep 2022 13:03:07 -0400
Subject: [PATCH 3/3] [FIX] Fix imports

---
 netneurotools/stats.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/netneurotools/stats.py b/netneurotools/stats.py
index d46f1f9..5726612 100644
--- a/netneurotools/stats.py
+++ b/netneurotools/stats.py
@@ -14,6 +14,7 @@
     from scipy.stats._stats_py import _chk2_asarray
 except ImportError:  # scipy < 1.8.0
     from scipy.stats.stats import _chk2_asarray
+from scipy.spatial.distance import squareform, pdist
 from sklearn.utils.validation import check_random_state
 from sklearn.linear_model import LinearRegression