-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexam_utils.py
More file actions
69 lines (53 loc) · 2.47 KB
/
exam_utils.py
File metadata and controls
69 lines (53 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import numpy as np
def random_train_test_split(X, y, proportion_train):
    """
    Randomly split the dataset into a training set and a test set.
    Args:
        X: array of samples. Shape (n_samples, n_features)
        y: array of labels. Shape (n_samples, ) or (n_samples, 1)
        proportion_train: fraction of samples placed in the training set. Must be between 0 and 1.
    Returns:
        X_train, X_test, y_train, y_test: the four split arrays, with X/y rows kept paired.
    """
    n_samples = X.shape[0]
    n_train = int(proportion_train * n_samples)  # number of samples in the training set
    # One random permutation of all row indices drives both splits,
    # so each sample lands in exactly one of the two sets.
    order = np.random.permutation(n_samples)
    train_idx = order[:n_train]
    test_idx = order[n_train:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
def stratified_train_test_split(X, y, train_prop):
    """
    Split randomly the dataset into training and test sets while preserving class proportions.
    Args:
        X: array of samples. Shape (n_samples, n_features)
        y: array of labels. Shape (n_samples, ) or (n_samples, 1)
        train_prop: proportion of samples to include in the training set. Must be between 0 and 1.
    Returns:
        X_train: array of training samples. Shape (n_train_samples, n_features)
        X_test: array of test samples. Shape (n_test_samples, n_features)
        y_train: array of training labels. Shape (n_train_samples, ) or (n_train_samples, 1)
        y_test: array of test labels. Shape (n_test_samples, ) or (n_test_samples, 1)
    """
    train_indices = []
    test_indices = []
    # Split each class independently so the train/test class ratios
    # mirror those of the full dataset.
    for label in np.unique(y):
        members = np.where(y == label)[0]  # row indices of samples with this label
        np.random.shuffle(members)  # randomize which members go to train vs test
        n_train = int(train_prop * members.shape[0])
        train_indices.extend(members[:n_train].tolist())
        test_indices.extend(members[n_train:].tolist())
    # At this point both lists are grouped by class; shuffle once more
    # so the returned arrays carry no class ordering.
    np.random.shuffle(train_indices)
    np.random.shuffle(test_indices)
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]