Skip to content

Commit 43216b9

Browse files
authored
Add files via upload
1 parent 9b87157 commit 43216b9

File tree

1 file changed

+217
-0
lines changed
  • samples/features/machine-learning-services/python/getting-started/bike-sharing prediction

1 file changed

+217
-0
lines changed
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
'''
2+
Pipeline implementation
3+
4+
5+
'''
6+
7+
8+
9+
import numpy as np
10+
import pandas as pd
11+
import time
12+
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
13+
from sklearn.preprocessing import StandardScaler
14+
from revoscalepy.functions.RxLogit import rx_logit_ex
15+
from revoscalepy.functions.RxPredict import rx_predict_ex
16+
17+
18+
#=========================
19+
20+
# Features engineering
21+
22+
#=========================
23+
24+
class OutliersHandler(BaseEstimator, TransformerMixin):
25+
"""Handle outliers"""
26+
27+
def fit(self, x, y = None):
28+
return self
29+
30+
def transform(self, df):
31+
32+
df.availablebikes = np.where(df.availablebikes > df.bikestands, df.bikestands, df.availablebikes)
33+
return df
34+
35+
class LabelDefiner(BaseEstimator, TransformerMixin):
36+
"""
37+
38+
Defines target variable
39+
Binary label 0 empty station, 1 otherwise
40+
41+
"""
42+
43+
def __init__(self, availability_threshold = 1):
44+
self.threshold = availability_threshold
45+
46+
def fit(self, x, y = None):
47+
return self
48+
49+
def transform(self, df):
50+
51+
df['label'] = np.where(df.availablebikes < self.threshold, 0, 1)
52+
return df
53+
54+
55+
56+
class DateTimeFeaturesExtractor(BaseEstimator, TransformerMixin):
57+
"""Extract Datetime features"""
58+
59+
def fit(self, x, y = None):
60+
return self
61+
62+
def transform(self, df):
63+
df['lastupdate']= pd.to_datetime(df['lastupdate'])
64+
df['day'] = df.lastupdate.dt.day.astype(int)
65+
df['month'] = df.lastupdate.dt.month.astype(int)
66+
df['hour']= df.lastupdate.dt.hour.astype(int)
67+
df['minute']= df.lastupdate.dt.minute.astype(int)
68+
df['isweekend'] = np.where(df['lastupdate'].dt.dayofweek > 4, 1, 0)
69+
df.sort_values(by='lastupdate', inplace = True)
70+
return df
71+
72+
73+
74+
class TSFeaturesExtractor(BaseEstimator, TransformerMixin):
75+
"""Extract time series related features"""
76+
77+
def __init__(self, max_lags = 4):
78+
self.__max_lags = max_lags
79+
80+
def fit(self, x, y=None):
81+
return self
82+
83+
def transform(self, df):
84+
85+
86+
df.sort_values(['lastupdate','stationid'], ascending = [True, True])
87+
88+
for i in range(self.__max_lags):
89+
df['lag' + str(i)] = df.groupby(['stationid'])['availablebikes'].shift(i + 1)
90+
91+
92+
df['1st_derivative'] = df.groupby('stationid')['lag0'].transform(lambda x: np.gradient(x))
93+
df['2nd_derivative'] = df.groupby('stationid')['1st_derivative'].transform(lambda x: np.gradient(x))
94+
df['fft_max_coeff'] = df.groupby(['stationid', 'month', 'day', 'hour'])['lag0'].transform(lambda x: np.amax(np.abs(np.fft.rfft(x))))
95+
df['fft_energy'] = df.groupby(['stationid', 'month', 'day', 'hour'])['lag0'].transform(lambda x: np.sum((np.abs(np.fft.rfft(x))) ** 2))
96+
97+
return df
98+
99+
100+
101+
102+
103+
class StatisticalFeaturesExtractor(BaseEstimator, TransformerMixin):
104+
"""Extract statistical related features"""
105+
106+
def __init__(self, max_lags = 4):
107+
self.__max_lags = max_lags
108+
109+
def fit(self, x, y=None):
110+
return self
111+
112+
def transform(self, df):
113+
114+
df['var'] = df.groupby(['stationid', 'month', 'day', 'hour'])['lag0'].transform('var')
115+
df['cumrelfreq'] = df.groupby(['stationid', 'month', 'day', 'hour'])['lag0'].cumsum() / self.__max_lags
116+
df['mad'] = df.groupby(['stationid', 'month', 'day', 'hour'])['lag0'].transform('mad')
117+
df['idxmax'] = df.groupby(['stationid', 'month', 'day', 'hour'])['lag0'].transform(\
118+
lambda x: np.argmax(x.ravel()))
119+
df['idxmin'] = df.groupby(['stationid', 'month', 'day', 'hour'])['lag0'].transform(\
120+
lambda x: np.argmin(x.ravel()))
121+
return df
122+
123+
124+
125+
126+
class FeaturesExcluder(BaseEstimator, TransformerMixin):
127+
"""features to exclude"""
128+
129+
def __init__(self, features = ['availablebikes', 'bikestands','lastupdate', 'zipcode','month', 'day']):
130+
self.__exclusionlist = features
131+
132+
def fit(self, X, y = None):
133+
return self
134+
135+
def transform(self, df):
136+
137+
df.drop(self.__exclusionlist, axis = 1, inplace = True)
138+
return df
139+
140+
141+
class FeaturesScaler(BaseEstimator, TransformerMixin):
142+
143+
"""Z-score scaler """
144+
145+
146+
147+
def fit(self, X, y = None):
148+
return self
149+
150+
def transform(self, df):
151+
152+
if df.isnull().any().any():
153+
df.dropna(inplace = True)
154+
cols = df.columns.tolist()
155+
excluded_cols = ['stationid', 'label','hour', 'minute', 'isweekend']
156+
157+
X = StandardScaler().fit_transform(df.drop(excluded_cols, axis=1, inplace = False))
158+
X = np.concatenate((df.loc[:, excluded_cols].as_matrix(), X), axis = 1)
159+
160+
df_out = pd.DataFrame(X, columns = cols)
161+
162+
return df_out
163+
164+
165+
class RxClassifier(BaseEstimator, ClassifierMixin):
166+
167+
""" Revoscalerpy logisitic regression binary classifier wrapped in sklearn estimator """
168+
169+
def __init__(self, computecontext):
170+
171+
self.__computecontext = computecontext
172+
173+
174+
def fit(self, X, y = None):
175+
176+
177+
"""Fit model to training data
178+
179+
180+
Args:
181+
X (pandas DataFrame): training data.
182+
y (None): Not used the target variable is passed in X.
183+
184+
return: coefficients (pandas DataFrame)
185+
186+
"""
187+
188+
formula = "label ~ F(stationid) + F(hour) + F(minute) + isweekend + lag0 + \
189+
lag1 + lag2 + lag3 + 1st_derivative + 2nd_derivative\
190+
+ fft_max_coeff + fft_energy + var + cumrelfreq + mad + idxmax + idxmin"
191+
192+
start = time.time()
193+
self.__clf = rx_logit_ex(formula, data = X, compute_context = self.__computecontext, report_progress = 3, verbose = 1)
194+
end = time.time()
195+
196+
print("Training time duration: %.2f seconds" % (end - start))
197+
return self.__clf.coefficients
198+
199+
200+
def predict(self, X):
201+
"""
202+
Perform classification on X
203+
204+
Args:
205+
X (pandas DataFrame): prediction input dataset
206+
207+
return: prediction results vector (numpy array)
208+
"""
209+
if self.__clf is None:
210+
raise RuntimeError("Data must be fitted before calling predict!")
211+
212+
predict = rx_predict_ex(self.__clf, data = X, compute_context = self.__computecontext)
213+
predictions = np.where(predict._results['label_Pred'] == 1, 1, 0)
214+
215+
return predictions
216+
217+

0 commit comments

Comments
 (0)