-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
106 lines (81 loc) · 2.85 KB
/
utils.py
File metadata and controls
106 lines (81 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import sys
import numpy as np
from sklearn import datasets
def to_categorical(x, n_col=None):
    """ One-hot encoding of nominal values.

    Args:
        x: 1-D integer array of class indices.
        n_col: number of classes (columns). Inferred as max(x) + 1 when None.

    Returns:
        2-D float array of shape (len(x), n_col) with a single 1 per row.
    """
    # Use `is None`, not truthiness: an explicit n_col=0 must not trigger inference.
    if n_col is None:
        n_col = np.amax(x) + 1
    one_hot = np.zeros((x.shape[0], n_col))
    one_hot[np.arange(x.shape[0]), x] = 1
    return one_hot
def shuffle_data(X, y):
    """ Return X and y shuffled in unison along the first axis. """
    order = list(range(X.shape[0]))
    np.random.shuffle(order)
    # Fancy-index both arrays with the same permutation so pairs stay aligned.
    return X[order], y[order]
def split_train_test(X, y, ratio=0.3):
    """ Shuffle the data and split it into train/test partitions.

    `ratio` is the fraction of samples reserved for the test set.
    Returns (train_X, train_y, test_X, test_y).
    """
    n_train = int(X.shape[0] * (1 - ratio))
    X, y = shuffle_data(X, y)
    return X[:n_train], y[:n_train], X[n_train:], y[n_train:]
def devide_on_feat(X, feat_i, threshold):
    """ Partition the rows of X on feature `feat_i` against `threshold`.

    Returns (left, right): `left` holds the rows where the feature value
    is >= threshold, `right` the remaining rows, both as numpy arrays.
    """
    left, right = [], []
    for row in X:
        if row[feat_i] >= threshold:
            left.append(row)
        else:
            right.append(row)
    return np.array(left), np.array(right)
def cal_entropy(y):
    """ Shannon entropy (base 2) of the label distribution in y. """
    n = len(y)
    entropy = 0
    for label in np.unique(y):
        p = np.sum(y == label) / n
        # log2 via change of base, matching the module's other math style.
        entropy -= p * (np.log(p) / np.log(2))
    return entropy
def mean_of_y(y):
    """ Column-wise mean of y.

    Returns a scalar when y is 1-D or has a single column, otherwise the
    per-column mean vector.
    """
    # np.mean over a 1-D array yields a 0-d scalar with no len(); wrapping in
    # atleast_1d makes the length check safe (fixes a TypeError on 1-D input).
    value = np.atleast_1d(np.mean(y, axis=0))
    return value if len(value) > 1 else value[0]
def cal_accuracy(y, preds):
    """ Fraction of predictions equal to the ground-truth labels. """
    n_correct = np.sum(y == preds)
    return n_correct / len(y)
def mean_squared_error(y_true, y_pred):
    """ Returns the mean squared error between y_true and y_pred """
    diff = y_true - y_pred
    return np.mean(diff ** 2)
def calculate_variance(X):
    """ Return the (population) variance of each feature column of X. """
    n_samples = np.shape(X)[0]
    # Center each column, then read the per-feature sums of squares off the
    # diagonal of the Gram matrix of the centered data.
    centered = X - np.ones(np.shape(X)) * X.mean(0)
    return (1 / n_samples) * np.diag(centered.T.dot(centered))
def standardize(X):
    """ Standardize the dataset X: zero mean, unit variance per column.

    Columns with zero standard deviation are left unchanged (this is why
    the vectorized one-liner `(X - mean) / std` is not used — it would
    divide by zero).

    Works on a copy: the caller's array is never mutated (the original
    aliased the input via `X_std = X` and modified it in place).
    """
    # Copy as float so integer inputs are not silently truncated on assignment.
    X_std = np.array(X, dtype=float)
    mean = X_std.mean(axis=0)
    std = X_std.std(axis=0)
    for col in range(np.shape(X_std)[1]):
        if std[col]:
            X_std[:, col] = (X_std[:, col] - mean[col]) / std[col]
    return X_std
def get_random_subsets(X, y, n_subsets, replacements=True):
    """ Return `n_subsets` random (X, y) subsets of the data.

    With `replacements=True` each subset is a bootstrap sample of the full
    size n_samples; otherwise each subset holds 50% of the samples drawn
    without replacement.

    Returns:
        List of [X_subset, y_subset] pairs.
    """
    n_samples = np.shape(X)[0]
    # Append y as a final column so a single shuffle keeps rows and labels aligned.
    X_y = np.concatenate((X, y.reshape((1, len(y))).T), axis=1)
    np.random.shuffle(X_y)
    # 100% of samples with replacement (bootstrap), 50% without.
    subsample_size = n_samples if replacements else n_samples // 2
    subsets = []
    for _ in range(n_subsets):
        idx = np.random.choice(
            range(n_samples),
            size=subsample_size,  # was np.shape(range(...)) — same value, direct
            replace=replacements)
        # Split the sampled rows back into features and labels.
        subsets.append([X_y[idx][:, :-1], X_y[idx][:, -1]])
    return subsets