'''
Pipeline implementation.

Scikit-learn compatible feature-engineering transformers and a revoscalepy
logistic-regression classifier for predicting bike station availability.
'''

import time

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.preprocessing import StandardScaler

from revoscalepy.functions.RxLogit import rx_logit_ex
from revoscalepy.functions.RxPredict import rx_predict_ex
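
# Expected input schema (inferred from the transformers in this module; the
# actual feed may contain more columns):
#   stationid, lastupdate, availablebikes, bikestands, zipcode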

# =========================
# Feature engineering
# =========================

class OutliersHandler(BaseEstimator, TransformerMixin):
    """Cap outliers: a station cannot report more available bikes than it has stands."""

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        # Clip availablebikes at bikestands to remove inconsistent readings
        df.availablebikes = np.where(df.availablebikes > df.bikestands, df.bikestands, df.availablebikes)
        return df


class LabelDefiner(BaseEstimator, TransformerMixin):
    """Define the target variable: binary label, 0 for an empty station, 1 otherwise."""

    def __init__(self, availability_threshold=1):
        self.threshold = availability_threshold

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        df['label'] = np.where(df.availablebikes < self.threshold, 0, 1)
        return df


class DateTimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Extract datetime features from the lastupdate timestamp."""

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        df['lastupdate'] = pd.to_datetime(df['lastupdate'])
        df['day'] = df.lastupdate.dt.day.astype(int)
        df['month'] = df.lastupdate.dt.month.astype(int)
        df['hour'] = df.lastupdate.dt.hour.astype(int)
        df['minute'] = df.lastupdate.dt.minute.astype(int)
        # Saturday (5) and Sunday (6) are flagged as weekend
        df['isweekend'] = np.where(df['lastupdate'].dt.dayofweek > 4, 1, 0)
        df.sort_values(by='lastupdate', inplace=True)
        return df


class TSFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Extract time-series related features (lags, derivatives, FFT statistics)."""

    def __init__(self, max_lags=4):
        self.__max_lags = max_lags

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        df.sort_values(['lastupdate', 'stationid'], ascending=[True, True], inplace=True)

        # Lagged availability per station: lag0 is the previous observation,
        # lag1 the one before it, and so on.
        for i in range(self.__max_lags):
            df['lag' + str(i)] = df.groupby(['stationid'])['availablebikes'].shift(i + 1)

        # First and second discrete derivatives of the lagged series, plus
        # spectral features computed per station, day and hour.
        df['1st_derivative'] = df.groupby('stationid')['lag0'].transform(lambda x: np.gradient(x))
        df['2nd_derivative'] = df.groupby('stationid')['1st_derivative'].transform(lambda x: np.gradient(x))
        df['fft_max_coeff'] = df.groupby(['stationid', 'month', 'day', 'hour'])['lag0'].transform(lambda x: np.amax(np.abs(np.fft.rfft(x))))
        df['fft_energy'] = df.groupby(['stationid', 'month', 'day', 'hour'])['lag0'].transform(lambda x: np.sum((np.abs(np.fft.rfft(x))) ** 2))

        return df


class StatisticalFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Extract statistical features per station, day and hour."""

    def __init__(self, max_lags=4):
        self.__max_lags = max_lags

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        group_keys = ['stationid', 'month', 'day', 'hour']
        df['var'] = df.groupby(group_keys)['lag0'].transform('var')
        df['cumrelfreq'] = df.groupby(group_keys)['lag0'].cumsum() / self.__max_lags
        df['mad'] = df.groupby(group_keys)['lag0'].transform('mad')
        # Position of the maximum / minimum observation within each group
        df['idxmax'] = df.groupby(group_keys)['lag0'].transform(lambda x: np.argmax(x.values))
        df['idxmin'] = df.groupby(group_keys)['lag0'].transform(lambda x: np.argmin(x.values))
        return df


class FeaturesExcluder(BaseEstimator, TransformerMixin):
    """Drop columns that should not be fed to the classifier."""

    def __init__(self, features=['availablebikes', 'bikestands', 'lastupdate', 'zipcode', 'month', 'day']):
        self.__exclusionlist = features

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        df.drop(self.__exclusionlist, axis=1, inplace=True)
        return df


class FeaturesScaler(BaseEstimator, TransformerMixin):
    """Z-score scaler that leaves identifier and categorical columns untouched."""

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        # Lag features are NaN for the first observations of each station
        if df.isnull().any().any():
            df.dropna(inplace=True)

        excluded_cols = ['stationid', 'label', 'hour', 'minute', 'isweekend']

        X = StandardScaler().fit_transform(df.drop(excluded_cols, axis=1, inplace=False))
        X = np.concatenate((df.loc[:, excluded_cols].values, X), axis=1)

        # Column names must follow the concatenation order: excluded columns first
        cols = excluded_cols + [c for c in df.columns if c not in excluded_cols]
        df_out = pd.DataFrame(X, columns=cols)

        return df_out


class RxClassifier(BaseEstimator, ClassifierMixin):
    """revoscalepy logistic regression binary classifier wrapped as a scikit-learn estimator."""

    def __init__(self, computecontext):
        self.__computecontext = computecontext
        self.__clf = None

    def fit(self, X, y=None):
        """Fit the model to the training data.

        Args:
            X (pandas DataFrame): training data.
            y (None): not used; the target variable ('label') is passed in X.

        Returns:
            coefficients (pandas DataFrame)
        """
        formula = ("label ~ F(stationid) + F(hour) + F(minute) + isweekend + lag0"
                   " + lag1 + lag2 + lag3 + 1st_derivative + 2nd_derivative"
                   " + fft_max_coeff + fft_energy + var + cumrelfreq + mad + idxmax + idxmin")

        start = time.time()
        self.__clf = rx_logit_ex(formula, data=X, compute_context=self.__computecontext, report_progress=3, verbose=1)
        end = time.time()

        print("Training duration: %.2f seconds" % (end - start))
        return self.__clf.coefficients

    def predict(self, X):
        """Perform classification on X.

        Args:
            X (pandas DataFrame): prediction input dataset.

        Returns:
            predictions (numpy array)
        """
        if self.__clf is None:
            raise RuntimeError("The model must be fitted before calling predict!")

        pred = rx_predict_ex(self.__clf, data=X, compute_context=self.__computecontext)
        predictions = np.where(pred._results['label_Pred'] == 1, 1, 0)

        return predictions
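

# ---------------------------------------------------------------------------
# Illustrative usage: a minimal sketch of how the transformers and classifier
# above could be chained with sklearn's Pipeline. The RxLocalSeq compute
# context, the 'stations.csv' path and that file's contents are assumptions
# for this example only, not part of the original module.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from sklearn.pipeline import Pipeline
    from revoscalepy import RxLocalSeq  # assumed local compute context

    pipeline = Pipeline([
        ('outliers', OutliersHandler()),
        ('label', LabelDefiner(availability_threshold=1)),
        ('datetime', DateTimeFeaturesExtractor()),
        ('ts_features', TSFeaturesExtractor(max_lags=4)),
        ('stat_features', StatisticalFeaturesExtractor(max_lags=4)),
        ('exclude', FeaturesExcluder()),
        ('scale', FeaturesScaler()),
        ('clf', RxClassifier(computecontext=RxLocalSeq())),
    ])

    # Hypothetical input file with the raw station feed.
    train_df = pd.read_csv('stations.csv')
    pipeline.fit(train_df)

    # The transformers mutate their input in place, so predict on a fresh copy.
    test_df = pd.read_csv('stations.csv')
    predictions = pipeline.predict(test_df)
    print(predictions[:10])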