-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
106 lines (81 loc) · 2.85 KB
/
utils.py
File metadata and controls
106 lines (81 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import sys
import numpy as np
from sklearn import datasets
def to_categorical(x, n_col=None):
    """ One-hot encoding of nominal values.

    Args:
        x: 1-D integer array of class indices.
        n_col: number of classes (columns). Inferred as max(x) + 1 when None.

    Returns:
        2-D float array of shape (len(x), n_col) with a single 1 per row.
    """
    # Use `is None`, not truthiness: an explicit n_col=0 must not trigger inference.
    if n_col is None:
        n_col = np.amax(x) + 1
    one_hot = np.zeros((x.shape[0], n_col))
    one_hot[np.arange(x.shape[0]), x] = 1
    return one_hot
def shuffle_data(X, y):
    """ Return X and y shuffled in unison along the first axis. """
    order = list(range(X.shape[0]))
    np.random.shuffle(order)
    # Fancy-index both arrays with the same permutation so pairs stay aligned.
    return X[order], y[order]
def split_train_test(X, y, ratio=0.3):
    """ Shuffle the data and split it into train/test partitions.

    `ratio` is the fraction of samples reserved for the test set.
    Returns (train_X, train_y, test_X, test_y).
    """
    n_train = int(X.shape[0] * (1 - ratio))
    X, y = shuffle_data(X, y)
    return X[:n_train], y[:n_train], X[n_train:], y[n_train:]
def devide_on_feat(X, feat_i, threshold):
    """ Partition the rows of X on feature `feat_i` against `threshold`.

    Returns (left, right): `left` holds the rows where the feature value
    is >= threshold, `right` the remaining rows, both as numpy arrays.
    """
    left, right = [], []
    for row in X:
        if row[feat_i] >= threshold:
            left.append(row)
        else:
            right.append(row)
    return np.array(left), np.array(right)
def cal_entropy(y):
    """ Shannon entropy (base 2) of the label distribution in y. """
    n = len(y)
    entropy = 0
    for label in np.unique(y):
        p = np.sum(y == label) / n
        # log2 via change of base, matching the module's other math style.
        entropy -= p * (np.log(p) / np.log(2))
    return entropy
def mean_of_y(y):
    """ Column-wise mean of y.

    Returns a scalar when y is 1-D or has a single column, otherwise the
    per-column mean vector.
    """
    # np.mean over a 1-D array yields a 0-d scalar with no len(); wrapping in
    # atleast_1d makes the length check safe (fixes a TypeError on 1-D input).
    value = np.atleast_1d(np.mean(y, axis=0))
    return value if len(value) > 1 else value[0]
def cal_accuracy(y, preds):
    """ Fraction of predictions equal to the ground-truth labels. """
    n_correct = np.sum(y == preds)
    return n_correct / len(y)
def mean_squared_error(y_true, y_pred):
    """ Returns the mean squared error between y_true and y_pred """
    diff = y_true - y_pred
    return np.mean(diff ** 2)
def calculate_variance(X):
    """ Return the (population) variance of each feature column of X. """
    n_samples = np.shape(X)[0]
    # Center each column, then read the per-feature sums of squares off the
    # diagonal of the Gram matrix of the centered data.
    centered = X - np.ones(np.shape(X)) * X.mean(0)
    return (1 / n_samples) * np.diag(centered.T.dot(centered))
def standardize(X):
    """ Standardize the dataset X: zero mean, unit variance per column.

    Columns with zero standard deviation are left unchanged (this is why
    the vectorized one-liner `(X - mean) / std` is not used — it would
    divide by zero).

    Works on a copy: the caller's array is never mutated (the original
    aliased the input via `X_std = X` and modified it in place).
    """
    # Copy as float so integer inputs are not silently truncated on assignment.
    X_std = np.array(X, dtype=float)
    mean = X_std.mean(axis=0)
    std = X_std.std(axis=0)
    for col in range(np.shape(X_std)[1]):
        if std[col]:
            X_std[:, col] = (X_std[:, col] - mean[col]) / std[col]
    return X_std
def get_random_subsets(X, y, n_subsets, replacements=True):
    """ Return `n_subsets` random (X, y) subsets of the data.

    With `replacements=True` each subset is a bootstrap sample of the full
    size n_samples; otherwise each subset holds 50% of the samples drawn
    without replacement.

    Returns:
        List of [X_subset, y_subset] pairs.
    """
    n_samples = np.shape(X)[0]
    # Append y as a final column so a single shuffle keeps rows and labels aligned.
    X_y = np.concatenate((X, y.reshape((1, len(y))).T), axis=1)
    np.random.shuffle(X_y)
    # 100% of samples with replacement (bootstrap), 50% without.
    subsample_size = n_samples if replacements else n_samples // 2
    subsets = []
    for _ in range(n_subsets):
        idx = np.random.choice(
            range(n_samples),
            size=subsample_size,  # was np.shape(range(...)) — same value, direct
            replace=replacements)
        # Split the sampled rows back into features and labels.
        subsets.append([X_y[idx][:, :-1], X_y[idx][:, -1]])
    return subsets