-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmetrics.py
More file actions
120 lines (91 loc) · 4.56 KB
/
metrics.py
File metadata and controls
120 lines (91 loc) · 4.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
# Display names of the metrics reported for regression models.
regression_metrics = ['Equation', 'R2 Score', 'RMSE', 'MSE', 'MAE', 'Chi^2', 'RSE']
# Display names of the metrics reported for classification models.
classification_metrics = ['Equation', 'Recall', 'Precision', 'F1 Score', 'Accuracy']
# Info card describing the soft-margin SVM classifier.
# Raw strings keep the MathJax backslashes intact without relying on Python's
# deprecated invalid-escape fallback (SyntaxWarning since 3.12); the runtime
# bytes of the math fields are unchanged.
soft_margin_svc = {
    'summary': 'Find the optimal hyperplane which maximally separates two classes of data.',
    'mathjax': r'\(\min \frac{1}{2} ||w^{2}|| + C\sum{\max{(0,1-y_i(w^Tx_i+b))}}\)',
    # Adjacent literals with explicit '\n' replace the old backslash line
    # continuations, which leaked source indentation into the displayed text.
    'assumptions': ('1. The dependent variable is 1 or -1\n'
                    '2. The Data is Partially Separable\n'
                    '3. Misclassification is allowed within the margin to prevent overfitting, controlled by hyperparameter C'),
    'complexity': r'\(Train: O(n^2)\\Test: O(n*m) \\Space: O(n*m)\)',
    'info-href': 'https://en.wikipedia.org/wiki/Support_vector_machine',
    'info-text': 'https://en.wikipedia.org/wiki/Support_vector_machine'
}
# Info card describing ordinary least-squares linear regression.
# Raw strings avoid Python's deprecated invalid-escape fallback; the mathjax
# objective also gains the closing parenthesis the original formula was
# missing: (y_i - (b0 + b1*x_i))^2, not (y_i - (b0 + b1*x_i)^2.
linear_regression = {
    'summary': 'Model the linear relationship between two variables by finding the line of best fit to the data',
    'mathjax': r'\(\min\beta: \frac{1}{N} \sum{(y_i-(\beta_0+\beta_1x_i))^2}\)',
    # Explicit '\n' joins replace backslash continuations that embedded
    # source indentation in the displayed text.
    'assumptions': ('1. Linear relationship between variables\n'
                    '2. Independent error terms\n'
                    '3. Constant variance of errors\n'
                    '4. Errors follow a normal distribution\n'
                    '5. Independent variables are not highly correlated'),
    'complexity': r'\(Train: O(n*m^2+m^3)\\Test: O(m) \\Space: O(m)\)',
    'info-href': 'https://en.wikipedia.org/wiki/Linear_regression',
    'info-text': 'https://en.wikipedia.org/wiki/Linear_regression'
}
# Info card describing logistic regression.
# Raw strings avoid Python's deprecated invalid-escape fallback (runtime bytes
# of the math fields are unchanged); also fixes the 'info-text' URL typo
# 'Logstic_regression' so the displayed link text matches the href.
logistic_regression = {
    'summary': 'Model the probability of a binary outcome (0 or 1) based on one or more predictor variables',
    'mathjax': r'\(\max\beta: \sum{[y_i\cdot log(p_i)+(1-y_i)\cdot log(1-p_i)]}\) Where \(p_i=\frac{1}{1+e^{-(\beta_1x_i+\beta_0)}} \)',
    # Explicit '\n' joins replace backslash continuations that embedded
    # source indentation in the displayed text.
    'assumptions': ('1. The dependent variable is 0 or 1\n'
                    '2. Independent observations\n'
                    '3. Linear relationship between predictors\n'
                    '4. Large sample size\n'
                    '5. Independent variables are not highly correlated'),
    'complexity': r'\(Train: O(n*m)\\Test: O(m) \\Space: O(m)\)',
    'info-href': 'https://en.wikipedia.org/wiki/Logistic_regression',
    'info-text': 'https://en.wikipedia.org/wiki/Logistic_regression'
}
# Regression Metrics #
# Return a value between 0 and 1, a larger value represents a greater fit
def r2_score(Y_pred, Y_actual):
    """Coefficient of determination, R^2 = 1 - RSS/TSS.

    Fixes two bugs in the original: `np.sum(d)**2` squared the *sum* of
    residuals instead of summing squared residuals (precedence), and TSS
    was computed from the residuals rather than the variation of Y_actual
    about its mean.
    """
    residuals = Y_actual - Y_pred
    rss = np.sum(residuals ** 2)                       # residual sum of squares
    tss = np.sum((Y_actual - np.mean(Y_actual)) ** 2)  # total sum of squares
    return 1 - (rss / tss)
# Mean Squared Error
# Returns a value between 0 and infinity
def mse(Y_pred, Y_actual):
    """Mean of the squared residuals.

    Fix: square each residual *before* summing; the original computed
    `sum(d)**2 / n`, squaring the single summed residual (precedence bug),
    which lets positive and negative errors cancel.
    """
    n = len(Y_pred)
    return np.sum((Y_actual - Y_pred) ** 2) / n
# Root Mean Squared Error (The Square root of MSE)
# Returns a value between 0 and infinity
def rmse(Y_pred, Y_actual):
    """Square root of the mean squared error.

    Fix: same precedence bug as mse — residuals must be squared
    element-wise before summing, not after.
    """
    n = len(Y_pred)
    return np.sqrt(np.sum((Y_actual - Y_pred) ** 2) / n)
# Mean Absolute Error
def mae(Y_pred, Y_actual):
    """Mean of the absolute residuals.

    Fix: take the absolute value per residual before summing; the
    original's `|sum(d)|` lets positive and negative errors cancel.
    """
    n = len(Y_pred)
    return np.sum(np.abs(Y_actual - Y_pred)) / n
# Reduced Chi Squared, an estimate for the variance of the error term
def reduced_chi_squared(Y_pred, Y_actual):
    """Residual sum of squares divided by the degrees of freedom (n - p).

    Fix: the original computed `Y_pred . Y_pred`, ignoring Y_actual
    entirely; chi-squared is built from the residuals Y_actual - Y_pred.
    """
    n = len(Y_actual)  # number of observations
    p = 1              # number of independent variables
    dof = n - p        # statistical degrees of freedom
    residuals = Y_actual - Y_pred
    return np.dot(residuals, residuals) / dof
# Reduced standard error, the square root of reduced chi squared
def rse(Y_pred, Y_actual):
    """Square root of the reduced chi-squared statistic.

    Fix: pass the arguments in the (Y_pred, Y_actual) order that
    reduced_chi_squared declares — the original call swapped them.
    """
    rcs = reduced_chi_squared(Y_pred, Y_actual)
    return np.sqrt(rcs)
# Classification Metrics
# Recall, the proportion of true positives among all positive samples
def recall(Y_actual, Y_pred):
    """Fraction of actual positives (label +1) that were predicted +1.

    Labels are encoded as +1 / -1.
    """
    actual_pos = (Y_actual == 1)
    true_pos = np.sum(actual_pos & (Y_pred == 1))
    false_neg = np.sum(actual_pos & (Y_pred == -1))
    return true_pos / (true_pos + false_neg)
# Precision, the proportion of true positives among predicted positive samples
def precision(Y_actual, Y_pred):
    """Fraction of predicted positives (+1) that are actually positive.

    Labels are encoded as +1 / -1.
    """
    predicted_pos = (Y_pred == 1)
    true_pos = np.sum(predicted_pos & (Y_actual == 1))
    false_pos = np.sum(predicted_pos & (Y_actual == -1))
    return true_pos / (true_pos + false_pos)
# Specificity, the proportion of negatives properly identified out of all negative cases
def specificity(Y_actual, Y_pred):
    """Fraction of actual negatives (label -1) correctly predicted as -1.

    Labels are encoded as +1 / -1.
    """
    actual_neg = (Y_actual == -1)
    true_neg = np.sum(actual_neg & (Y_pred == -1))
    false_pos = np.sum(actual_neg & (Y_pred == 1))
    return true_neg / (true_neg + false_pos)
# F1 Score, the harmonic mean of precision and recall
def f1_score(Y_actual, Y_pred):
    """Harmonic mean of precision and recall.

    Fix: returns 0.0 when precision + recall == 0 (the harmonic mean is
    undefined there; 0.0 is the standard convention, matching
    scikit-learn's default) instead of dividing by zero.
    """
    p = precision(Y_actual, Y_pred)
    r = recall(Y_actual, Y_pred)
    denom = p + r
    if denom == 0:
        return 0.0
    return 2 * (p * r) / denom
# Accuracy, the proportion of correctly classified samples
def accuracy(Y_actual, Y_pred):
    """Proportion of samples whose predicted label equals the true label."""
    correct = (Y_actual == Y_pred)
    return np.mean(correct)