|
| 1 | +import sys |
| 2 | +import numpy as np |
| 3 | +from sklearn.pipeline import Pipeline |
| 4 | +from datasource import DataSource |
| 5 | +from pipeline import * |
| 6 | +from revoscalepy.etl.RxImport import rx_import_datasource |
| 7 | +from sklearn.metrics import classification_report |
| 8 | + |
| 9 | + |
| 10 | + |
def _split_train_test(df, test_size, group_col='stationid'):
    """Split ``df`` into train/test frames, holding out the tail of each group.

    The previous split used ``groupby(...).head(df.shape[0] - test_size)``.
    ``head(n)`` is applied per group, while ``df.shape[0]`` is the row count
    of the whole frame, so ``n`` could exceed a group's length and the
    held-out rows of that group ended up in the training set as well.

    Args:
        df: DataFrame containing ``group_col``. Rows are assumed to already be
            in chronological order within each group -- TODO confirm against
            the upstream pipeline.
        test_size: Number of trailing rows per group to hold out.
        group_col: Column identifying the group each row belongs to.

    Returns:
        A ``(train, test)`` tuple of DataFrames; both keep the original row
        order and share no rows.
    """
    # Distance of every row from the end of its own group: 0 for the last
    # row of a group, 1 for the one before it, and so on.
    rows_from_end = df.groupby(group_col).cumcount(ascending=False)
    is_test = rows_from_end < test_size
    return df[~is_test], df[is_test]


def run():
    """Load the data, build features, fit the classifier and print an evaluation.

    Steps: load the data through ``DataSource``, run the feature pipeline,
    hold out the last ``test_size`` rows of every station, fit
    ``RxClassifier`` on the remaining rows, then print the tail of the
    returned coefficients and a classification report for the hold-out set.
    """
    # Modify the connection string to point to the MLS/SQL Server instance
    # where you restored the database.
    connectionstring = 'Driver=SQL Server;Server=MLMACHINE\\SQLSERVER17;Database=velibdb;Trusted_Connection=True;'

    ds = DataSource(connectionstring)
    df = ds.loaddata()

    pipeline = Pipeline(steps=[
        ('outliers', OutliersHandler()),
        ('label', LabelDefiner()),
        ('dt', DateTimeFeaturesExtractor()),
        ('ts', TSFeaturesExtractor()),
        ('st', StatisticalFeaturesExtractor()),
        ('exclusion', FeaturesExcluder()),
        ('scaler', FeaturesScaler()),
    ])

    # Execute the pipeline.
    df = pipeline.fit_transform(df)

    # Split the dataset: one day of observations per station is held out
    # (24 * 4 rows, i.e. presumably 15-minute sampling -- TODO confirm).
    test_size = 24 * 4
    train, test = _split_train_test(df, test_size)

    # Fit the classifier.
    clf = RxClassifier(computecontext=ds.getcomputecontext())
    coeffs = clf.fit(train)
    # Print coefficients and exclude the stationid factor levels.
    print(coeffs.tail(14))

    # Run prediction on the hold-out set and evaluate.
    y_pred = clf.predict(test.drop(['label'], axis=1))
    # ``.values`` replaces ``as_matrix()``, which newer pandas releases removed.
    y_truth = test['label'].values
    print(classification_report(y_truth, y_pred))
| 53 | + |
| 54 | + |
| 55 | + |
# Only start the workflow when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    run()
| 58 | + |
| 59 | + |
0 commit comments