Skip to content

Commit 98d79c5

Browse files
Fixed small bug in manifest. Better readme file. Included examples previously made in another repository
1 parent 80053cf commit 98d79c5

File tree

4 files changed

+184
-8
lines changed

4 files changed

+184
-8
lines changed

MANIFEST.in

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
recursive-include dependencies/pybind11 *.*
2-
recursive-include dependencies/eigen-master *.*
1+
recursive-include dependencies/pybind11 *
2+
recursive-include dependencies/eigen-master *
33
recursive-include cpp *.h *.cpp
44
include LICENSE
55
include README.md

README.md

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
1-
# aplr
2-
Automatic Piecewise Linear Regression
1+
# APLR
2+
Automatic Piecewise Linear Regression.
33

4-
# about
4+
# About
55
Build interpretable parametric machine learning models in Python based on the Automatic Piecewise Linear Regression methodology developed by Mathias von Ottenbreit.
66

7-
# how to install
7+
# How to install
88
pip install aplr
99

10-
# how to use
11-
See the two example Python scripts.
10+
# How to use
11+
Please see the two example Python scripts in the examples folder. They cover common use cases, but not all of the functionality in this package. For example, fitting with user-specified observation weights is possible, but the example scripts do not use this functionality.
12+
13+
# Sponsorship
14+
Please sponsor Ottenbreit Data Science by clicking on the Sponsor button. Sufficient funding will enable maintenance of APLR and further development, such as developing a classifier based on APLR.
15+
16+
# API reference
17+
A thorough API reference will be provided after Ottenbreit Data Science receives a total of $1000 in sponsorship funds. The reason is that it will take some time to write it and the work will not start until sufficient funding is available.
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import pandas as pd
2+
import pickle
3+
from sklearn.model_selection import GridSearchCV, train_test_split
4+
from sklearn.datasets import load_diabetes
5+
from aplr import APLRRegressor
6+
7+
8+
#Settings
9+
random_state=0
10+
11+
#Loading data
12+
diabetes = load_diabetes()
13+
data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
14+
data["target"] = pd.Series(diabetes.target)
15+
16+
#Please note that APLR requires that all columns in the data have numerical values.
17+
#This means that if you have missing values in the data then you need to either drop rows with missing data or impute them.
18+
#This also means that if you have a categorical text variable then you need to convert it to, for example, dummy variables for each category.
19+
20+
#Randomly splitting data into training and test sets
21+
data_train, data_test = train_test_split(data, test_size=0.3, random_state=random_state)
22+
del data
23+
24+
#Predictors and response
25+
predictors=diabetes.feature_names
26+
response="target"
27+
predicted="predicted"
28+
29+
#Training model
30+
param_grid = {"max_interactions":[100000],"max_interaction_level":[0,1,2,3,100],"min_observations_in_split":[1, 20, 50, 100, 200]}
31+
grid_search_cv = GridSearchCV(APLRRegressor(random_state=random_state,verbosity=1,m=1000,v=0.1),param_grid,cv=5,n_jobs=4,scoring="neg_mean_squared_error")
32+
grid_search_cv.fit(data_train[predictors].values,data_train[response].values)
33+
best_model:APLRRegressor = grid_search_cv.best_estimator_
34+
best_model.set_term_names(X_names=predictors)
35+
print("Done training")
36+
37+
#Saving model
38+
#Using a context manager so that the file is flushed and closed as soon as the model has been written
with open("best_model.zip","wb") as model_file:
    pickle.dump(best_model,model_file)
39+
40+
#Cross validation results when doing grid search
41+
cv_results = pd.DataFrame(grid_search_cv.cv_results_).sort_values(by="rank_test_score")
42+
43+
#Validation errors that occurred during training of the best model. APLR used the boosting step that gave the lowest validation error
44+
validation_error_per_boosting_step = best_model.get_validation_error_steps()
45+
46+
#Terms in the best model
47+
terms=pd.DataFrame({"Predictor":best_model.get_term_names(),"Coefficient":best_model.get_term_coefficients()})
48+
49+
#Coefficients for intercept and the first predictor per boosting step
50+
intercept_coefficient_per_boosting_step = best_model.get_intercept_steps()
51+
first_predictor_coefficient_per_boosting_step = best_model.get_term_coefficient_steps(term_index=0)
52+
53+
#Feature importance was estimated on the validation set when the best model was trained
54+
estimated_feature_importance = pd.DataFrame({"predictor":predictors,"importance":best_model.get_feature_importance()})
55+
estimated_feature_importance = estimated_feature_importance.sort_values(by="importance", ascending=False)
56+
57+
58+
#PREDICTING AND TESTING ON THE TEST SET
59+
data_test[predicted]=best_model.predict(data_test[predictors].values)
60+
61+
#Goodness of fit
62+
correlation=pd.DataFrame({"response":data_test[response],"prediction":data_test[predicted]}).corr()
63+
mse=((data_test[response]-data_test[predicted])**2).mean()
64+
mae=(data_test[response]-data_test[predicted]).abs().mean()
65+
#Selecting the correlation between response and prediction by label. Positional lookup with [0] on a label-indexed Series is deprecated in pandas
goodness_of_fit=pd.DataFrame({"mse":[mse],"mae":[mae],"correlation":[correlation.loc["response","prediction"]]})
66+
goodness_of_fit["r_squared"] = goodness_of_fit["correlation"]**2
67+
68+
#Local feature importance for each prediction
69+
term_names_excluding_intercept = best_model.get_term_names()[1:]
70+
local_feature_importance_of_each_term = pd.DataFrame(
71+
best_model.calculate_local_feature_importance_for_terms(data_test[predictors]),
72+
columns = term_names_excluding_intercept
73+
)
74+
estimated_local_feature_importance_of_each_original_predictor = pd.DataFrame(
75+
best_model.calculate_local_feature_importance(data_test[predictors]),
76+
columns = predictors
77+
)
78+
79+
#Calculate terms on test data
80+
calculated_terms = pd.DataFrame(best_model.calculate_terms(data_test[predictors]), columns = term_names_excluding_intercept)

examples/train_aplr_validation.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import pandas as pd
2+
import numpy as np
3+
import pickle
4+
from sklearn.model_selection import ParameterGrid, train_test_split
5+
from sklearn.datasets import load_diabetes
6+
from aplr import APLRRegressor
7+
8+
9+
#Settings
10+
random_state=0
11+
12+
#Loading data
13+
diabetes = load_diabetes()
14+
data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
15+
data["target"] = pd.Series(diabetes.target)
16+
17+
#Please note that APLR requires that all columns in the data have numerical values.
18+
#This means that if you have missing values in the data then you need to either drop rows with missing data or impute them.
19+
#This also means that if you have a categorical text variable then you need to convert it to, for example, dummy variables for each category.
20+
21+
#Randomly splitting data into training and test sets
22+
data_train, data_test = train_test_split(data, test_size=0.3, random_state=random_state)
23+
del data
24+
25+
#Predictors and response
26+
predictors=diabetes.feature_names
27+
response="target"
28+
predicted="predicted"
29+
30+
#Training model
31+
#Training one model per parameter combination and keeping the model with the lowest validation error
param_grid=ParameterGrid({"max_interactions":[100000],"max_interaction_level":[0,1,2,3,100],"min_observations_in_split":[1, 20, 50, 100, 200]})
validation_results_per_model=[]
best_validation_result=np.inf
#Initialised under the same name that the rest of the script uses for the selected model
best_model=None
for params in param_grid:
    model = APLRRegressor(random_state=random_state,verbosity=3,m=1000,v=0.1,**params)
    model.fit(data_train[predictors].values,data_train[response].values,X_names=predictors)
    #The lowest validation error over all boosting steps is the score of this parameter combination
    validation_error_for_this_model=np.min(model.get_validation_error_steps())
    validation_results_for_this_model=pd.DataFrame(model.get_params(),index=[0])
    validation_results_for_this_model["validation_error"]=validation_error_for_this_model
    validation_results_per_model.append(validation_results_for_this_model)
    if validation_error_for_this_model<best_validation_result:
        best_validation_result=validation_error_for_this_model
        best_model=model
#Concatenating once after the loop instead of growing a DataFrame on every iteration
validation_results=pd.concat(validation_results_per_model)
#Failing early if no model improved on the initial score, for example if every validation error was NaN
if best_model is None:
    raise RuntimeError("No model produced a usable validation error.")
print("Done training")
46+
47+
#Saving model
48+
#Using a context manager so that the file is flushed and closed as soon as the model has been written
with open("best_model.zip","wb") as model_file:
    pickle.dump(best_model,model_file)
49+
50+
#Validation results when doing grid search
51+
validation_results = validation_results.sort_values(by="validation_error")
52+
53+
#Validation errors that occurred during training of the best model. APLR used the boosting step that gave the lowest validation error
54+
validation_error_per_boosting_step = best_model.get_validation_error_steps()
55+
56+
#Terms in the best model
57+
terms=pd.DataFrame({"Predictor":best_model.get_term_names(),"Coefficient":best_model.get_term_coefficients()})
58+
59+
#Coefficients for intercept and the first predictor per boosting step
60+
intercept_coefficient_per_boosting_step = best_model.get_intercept_steps()
61+
first_predictor_coefficient_per_boosting_step = best_model.get_term_coefficient_steps(term_index=0)
62+
63+
#Feature importance was estimated on the validation set when the best model was trained
64+
estimated_feature_importance = pd.DataFrame({"predictor":predictors,"importance":best_model.get_feature_importance()})
65+
estimated_feature_importance = estimated_feature_importance.sort_values(by="importance", ascending=False)
66+
67+
68+
#PREDICTING AND TESTING ON THE TEST SET
69+
data_test[predicted]=best_model.predict(data_test[predictors].values)
70+
71+
#Goodness of fit
72+
correlation=pd.DataFrame({"response":data_test[response],"prediction":data_test[predicted]}).corr()
73+
mse=((data_test[response]-data_test[predicted])**2).mean()
74+
mae=(data_test[response]-data_test[predicted]).abs().mean()
75+
#Selecting the correlation between response and prediction by label. Positional lookup with [0] on a label-indexed Series is deprecated in pandas
goodness_of_fit=pd.DataFrame({"mse":[mse],"mae":[mae],"correlation":[correlation.loc["response","prediction"]]})
76+
goodness_of_fit["r_squared"] = goodness_of_fit["correlation"]**2
77+
78+
#Local feature importance for each prediction
79+
term_names_excluding_intercept = best_model.get_term_names()[1:]
80+
local_feature_importance_of_each_term = pd.DataFrame(
81+
best_model.calculate_local_feature_importance_for_terms(data_test[predictors]),
82+
columns = term_names_excluding_intercept
83+
)
84+
estimated_local_feature_importance_of_each_original_predictor = pd.DataFrame(
85+
best_model.calculate_local_feature_importance(data_test[predictors]),
86+
columns = predictors
87+
)
88+
89+
#Calculate terms on test data
90+
calculated_terms = pd.DataFrame(best_model.calculate_terms(data_test[predictors]), columns = term_names_excluding_intercept)

0 commit comments

Comments
 (0)