|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- coding:utf-8 -*- |
| 3 | +# @Filename: _RESSEL.py |
| 4 | +# @Author: Daniel Puente Ramírez |
| 5 | +# @Time: 25/4/22 18:49 |
| 6 | + |
| 7 | +import numpy as np |
| 8 | +import pandas as pd |
| 9 | +from sklearn.metrics import f1_score |
| 10 | + |
| 11 | + |
class RESSEL:
    """
    Reliable ensemble for semi-supervised learning (RESSEL).

    Trains ``k`` copies of a base classifier, each on its own bootstrap
    sample of the labeled data, and enriches every copy by self-training on
    a sample of the unlabeled data. The out-of-bag labeled rows are used to
    keep, per member, the best scoring classifier seen during self-training.

    de Vries, S., & Thierens, D. (2021). A reliable ensemble based approach
    to semi-supervised learning. Knowledge-Based Systems, 215, 106738.

    :param n: number of pseudo-labeled samples added per self-training
        iteration (split among classes following the class distribution).
    :param m: number of self-training iterations per ensemble member.
    :param k: number of classifiers in the ensemble.
    :param unlabeled_sample_frac: fraction of the unlabeled samples drawn
        (without replacement) for each ensemble member.
    :param random_state: int, indexable collection with one seed per
        member, ``None``, or any object accepted by ``DataFrame.sample``.
    :param reuse_samples: if False, unlabeled samples that were
        pseudo-labeled are removed from the member's unlabeled pool.
    """

    def __init__(self, n=10, m=10, k=8, unlabeled_sample_frac=0.75,
                 random_state=42, reuse_samples=True):
        self.n = n
        self.m = m
        self.k = k
        self.unlabeled_sample_frac = unlabeled_sample_frac
        self.random_state = random_state
        self.reuse_samples = reuse_samples
        self.ensemble = []

    def __member_seed(self, i):
        """Return the sampling seed used for the i-th ensemble member."""
        state = self.random_state
        if state is None:
            return None
        if hasattr(state, '__iter__'):
            return state[i]
        if isinstance(state, (int, np.integer)):
            # A single integer seed must be varied per member; otherwise
            # every member would draw exactly the same bootstrap sample.
            return state + i
        return state

    def fit(self, labeled, unlabeled, base_estimator, estimator_params=None):
        """
        Build an ensemble based on the base_estimator.

        The input DataFrames are not modified; internal copies with
        positional column names are used instead.

        :param labeled: pandas DataFrame with labeled samples; the last
            column holds the class label.
        :param unlabeled: pandas DataFrame with unlabeled samples.
        :param base_estimator: base Classifier (a callable returning a new
            estimator exposing ``fit``, ``predict`` and ``predict_proba``).
        :param estimator_params: dict of params to pass to the estimator.
        :return: the ensemble in case is needed.
        :raises AttributeError: if an input is not a DataFrame or the base
            estimator is None.
        :raises ValueError: if ``labeled`` does not have exactly one more
            column than ``unlabeled``.
        """
        if not isinstance(labeled, pd.DataFrame):
            raise AttributeError(
                "Labeled samples object needs to be a Pandas DataFrame. "
                f"Not a {type(labeled)}")

        if not isinstance(unlabeled, pd.DataFrame):
            raise AttributeError(
                "Unlabeled samples object needs to be a Pandas DataFrame. "
                f"Not a {type(unlabeled)}")

        if labeled.shape[1] != unlabeled.shape[1] + 1:
            raise ValueError(
                "Labeled samples must have one more attribute than the "
                f"unlabeled ones. Got {labeled.shape[1]} and "
                f"{unlabeled.shape[1]}.")

        if base_estimator is None:
            raise AttributeError("The base estimator can not be None.")

        # Start from a fresh ensemble so that calling fit() again does not
        # leave unfitted estimators from a previous call in the list.
        params = {} if estimator_params is None else estimator_params
        self.ensemble = [base_estimator(**params) for _ in range(self.k)]

        # Work on re-indexed copies: the caller's frames stay untouched and
        # the unique positional index lets us identify out-of-bag rows.
        labeled = labeled.reset_index(drop=True)
        labeled.columns = list(range(labeled.shape[1]))
        unlabeled = unlabeled.reset_index(drop=True)
        unlabeled.columns = list(range(unlabeled.shape[1]))

        for i in range(self.k):
            seed = self.__member_seed(i)

            bootstrap = labeled.sample(n=len(labeled), replace=True,
                                       random_state=seed)
            l_i = bootstrap.reset_index(drop=True)
            u_i = unlabeled.sample(frac=self.unlabeled_sample_frac,
                                   replace=False, random_state=seed,
                                   ignore_index=True)

            # Out-of-bag rows: labeled rows never drawn by the bootstrap.
            oob_i = labeled.drop(index=bootstrap.index.unique())
            if oob_i.empty:
                # The bootstrap drew every row; fall back to the full
                # labeled set so the member can still be evaluated.
                oob_i = labeled

            # Class label -> relative frequency in the bootstrap sample.
            d_class_i = l_i.iloc[:, -1].value_counts(
                normalize=True).to_dict()

            self.ensemble[i].fit(l_i.iloc[:, :-1], np.ravel(l_i.iloc[:, -1:]))
            self.__robust_self_training(i, l_i, u_i, oob_i, d_class_i)

        return self.ensemble

    def __robust_self_training(self, iteration, l_i, u_i, oob_i, d_class_i):
        """
        Procedure to enrich a given classifier.

        Repeats up to ``m`` times: pseudo-label the most confident
        unlabeled samples (per class, following ``d_class_i``), refit the
        classifier and score it on ``oob_i``. The best scoring snapshot
        replaces ``self.ensemble[iteration]``.

        :param iteration: index of the ensemble member to enrich.
        :param l_i: labeled training DataFrame of the member (label last).
        :param u_i: unlabeled DataFrame available to the member.
        :param oob_i: labeled DataFrame used for evaluation (label last).
        :param d_class_i: dict mapping class label to its relative
            frequency in ``l_i``.
        """
        import warnings
        from copy import deepcopy

        classifier = self.ensemble[iteration]
        x_oob = oob_i.iloc[:, :-1]
        y_oob = np.ravel(oob_i.iloc[:, -1:])
        label_column = l_i.columns[-1]

        # The weighted F1 is a score: higher is better.
        best_score = f1_score(y_true=y_oob, y_pred=classifier.predict(x_oob),
                              average="weighted")
        # Snapshot: the classifier is refitted in place below, so keeping a
        # plain reference would always end up pointing at the last fit.
        best_classifier = deepcopy(classifier)

        for _ in range(self.m):
            if u_i.empty:
                break

            prob_i = np.asarray(classifier.predict_proba(u_i))
            # Column j of predict_proba refers to classes_[j]; fall back to
            # the column index when the estimator does not expose classes_.
            classes = getattr(classifier, "classes_",
                              np.arange(prob_i.shape[1]))
            predicted = prob_i.argmax(axis=1)
            confidence = prob_i.max(axis=1)

            chosen_positions = []
            chosen_labels = []
            not_enough = False
            for class_index, label in enumerate(classes):
                quota = int(d_class_i.get(label, 0) * self.n)
                candidates = np.flatnonzero(predicted == class_index)
                # Most confident predictions first.
                ranked = candidates[
                    np.argsort(-confidence[candidates], kind="stable")]
                if len(ranked) < quota:
                    not_enough = True
                selected = ranked[:quota]
                chosen_positions.extend(selected.tolist())
                chosen_labels.extend([label] * len(selected))

            if not_enough:
                warnings.warn(
                    "Warning: There are not enough samples to keep the "
                    "proportion, consider changing the problem to be able "
                    "to reuse samples or change the model parametrization. ")

            if not chosen_positions:
                # Nothing to add: further iterations would be identical.
                break

            samples_u_best = u_i.iloc[chosen_positions].copy()
            samples_u_best[label_column] = chosen_labels
            l_i = pd.concat([l_i, samples_u_best], ignore_index=True, axis=0)

            if not self.reuse_samples:
                u_i = u_i.drop(index=u_i.index[chosen_positions])

            classifier.fit(l_i.iloc[:, :-1], np.ravel(l_i.iloc[:, -1:]))

            current_score = f1_score(y_true=y_oob,
                                     y_pred=classifier.predict(x_oob),
                                     average="weighted")

            if current_score > best_score:
                best_score = current_score
                best_classifier = deepcopy(classifier)

        self.ensemble[iteration] = best_classifier

    def predict(self, samples):
        """
        Predict using the trained ensemble with a majority vote.

        :param samples: pandas DataFrame or vector shape (n_samples,
            n_attributes)

        :return: numpy array: predicted samples.
        :raises InterruptedError: if the ensemble has not been fitted.
        """
        if isinstance(samples, pd.DataFrame):
            samples = samples.to_numpy()
        if not self.ensemble:
            raise InterruptedError("To be able to predict, fitting is needed "
                                   "to be already done.")
        # One row of predictions per classifier; the column-wise mode is the
        # majority vote (first mode is taken when there is a tie).
        votes = np.array([classifier.predict(samples)
                          for classifier in self.ensemble])
        return pd.DataFrame(votes).mode().iloc[0].to_numpy()
0 commit comments