From 77e5d021862abc91f36424d3ed7069bc17aa1cd3 Mon Sep 17 00:00:00 2001 From: justinehansen Date: Mon, 12 Jul 2021 10:31:49 -0700 Subject: [PATCH 1/3] [ENH] Added distance-dependent cross validation --- netneurotools/stats.py | 100 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/netneurotools/stats.py b/netneurotools/stats.py index e3148fa..8993653 100644 --- a/netneurotools/stats.py +++ b/netneurotools/stats.py @@ -6,10 +6,12 @@ import warnings import numpy as np +import random from tqdm import tqdm from itertools import combinations from scipy import optimize, spatial, special, stats as sstats from scipy.stats.stats import _chk2_asarray +from scipy.spatial.distance import squareform, pdist from sklearn.utils.validation import check_random_state from sklearn.linear_model import LinearRegression @@ -925,3 +927,101 @@ def get_reg_r_sq(X, y): model_metrics["full_r_sq"] = model_r_sq[tuple(range(n_predictor))] return model_metrics, model_r_sq + + +def cv_distance_dependent(X, y, coords, train_pct=.75, nsplits=1000, + metric='rsq', use_adjusted_rsq=True): + ''' + Distance-dependent cross-validation of regression equation `y ~ X` + + Parameters + ---------- + X : (N[, R]) array_like + Coefficient matrix of `R` variables for `N` brain regions + y : (N,) array_like + Dependent variable vector for `N` brain regions + coords : (N, 3) array_like + Coordinate matrix for `N` brain regions + train_pct : float, optional + Percentage of brain regions in the training set. 0 < train_pct < 1 + Default: 75% + nsplits : float, optional + Number of train/test splits. Default: 1000 + metric : {'rsq', 'corr'}, optional + Metric of model assessment. 'rsq' will return the adjusted r-squared + of the train and test set model performance for each split. 'corr' will + return the correlation between predicted and empiracle observations + for the train and test set. Default: 'rsq' + use_adjusted_rsq : bool, optional + Whether to use adjusted r-squared. Only relevant if metric is 'rsq'. + Default: True + + Returns + ------- + train_metric : (nsplits,) list + List of length `nsplits` of performance metric on the training set. + test_metric : (nsplits,) list + List of length `nsplits` of performance metric on the test set. + ''' + + # sklearn linear regression wrapper + def get_reg_r_sq(X, y): + lin_reg = LinearRegression() + lin_reg.fit(X, y) + yhat = lin_reg.predict(X) + SS_Residual = sum((y - yhat) ** 2) + SS_Total = sum((y - np.mean(y)) ** 2) + r_squared = 1 - (float(SS_Residual)) / SS_Total + adjusted_r_squared = 1 - (1 - r_squared) * \ + (len(y) - 1) / (len(y) - X.shape[1] - 1) + if use_adjusted_rsq: + return adjusted_r_squared + else: + return r_squared + + P = squareform(pdist(coords, metric="euclidean")) + train_metric = [] + test_metric = [] + + for i in range(nsplits): + + # randomly chosen source node + sourceNode = random.choice(range(0, len(coords), 1)) + # distance from source node to all other nodes in network + distances = P[sourceNode, :] + idx = np.argsort(distances) + + # train_pct of nodes closest to source node comprise the training set + # the remaining (1 - train_pct) of nodes comprise the test set + train_idx = idx[:int(np.floor(train_pct * len(coords)))] + test_idx = idx[int(np.floor(train_pct * len(coords))):] + + # linear regression + mdl = LinearRegression() + mdl.fit(X[train_idx, :], y[train_idx]) + if metric == 'rsq': + # get r^2 of train set + train_metric.append(get_reg_r_sq(X[train_idx, :], y[train_idx])) + + elif metric == 'corr': + rho, _ = sstats.pearsonr(mdl.predict(X[train_idx, :]), + y[train_idx]) + train_metric.append(rho) + + # prediction on test set + yhat = mdl.predict(X[test_idx, :]) + if metric == 'rsq': + # get r^2 of test set + SS_Residual = sum((y[test_idx] - yhat) ** 2) + SS_Total = sum((y[test_idx] - np.mean(y[test_idx])) ** 2) + r_squared = 1 - (float(SS_Residual)) / SS_Total + adjusted_r_squared = 1-(1-r_squared)*((len(y[test_idx]) - 1) / + (len(y[test_idx]) - + X.shape[1]-1)) + test_metric.append(adjusted_r_squared) + + elif metric == 'corr': + rho, _ = sstats.pearsonr(yhat, y[test_idx]) + test_metric.append(rho) + + return train_metric, test_metric From 6fb2e3d7490f0a1023b069e32e2303177f5ededa Mon Sep 17 00:00:00 2001 From: justinehansen Date: Mon, 12 Jul 2021 10:55:27 -0700 Subject: [PATCH 2/3] [STY] Style fix --- netneurotools/stats.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/netneurotools/stats.py b/netneurotools/stats.py index 8993653..8b071db 100644 --- a/netneurotools/stats.py +++ b/netneurotools/stats.py @@ -1005,7 +1005,7 @@ def get_reg_r_sq(X, y): elif metric == 'corr': rho, _ = sstats.pearsonr(mdl.predict(X[train_idx, :]), - y[train_idx]) + y[train_idx]) train_metric.append(rho) # prediction on test set @@ -1015,9 +1015,9 @@ def get_reg_r_sq(X, y): SS_Residual = sum((y[test_idx] - yhat) ** 2) SS_Total = sum((y[test_idx] - np.mean(y[test_idx])) ** 2) r_squared = 1 - (float(SS_Residual)) / SS_Total - adjusted_r_squared = 1-(1-r_squared)*((len(y[test_idx]) - 1) / - (len(y[test_idx]) - - X.shape[1]-1)) + adjusted_r_squared = 1 - (1 - r_squared) * ((len(y[test_idx]) - 1) + / (len(y[test_idx]) + - X.shape[1] - 1)) test_metric.append(adjusted_r_squared) elif metric == 'corr': From f782c1bab4f65ee931a984fa928cf4ff02a644db Mon Sep 17 00:00:00 2001 From: Zhen-Qi Liu Date: Wed, 28 Sep 2022 13:03:07 -0400 Subject: [PATCH 3/3] [FIX] Fix imports --- netneurotools/stats.py | 1 + 1 file changed, 1 insertion(+) diff --git a/netneurotools/stats.py b/netneurotools/stats.py index d46f1f9..5726612 100644 --- a/netneurotools/stats.py +++ b/netneurotools/stats.py @@ -14,6 +14,7 @@ from scipy.stats._stats_py import _chk2_asarray except ImportError: # scipy < 1.8.0 from scipy.stats.stats import _chk2_asarray +from scipy.spatial.distance import squareform, pdist from sklearn.utils.validation import check_random_state from sklearn.linear_model import LinearRegression