Fixed small bug in manifest. Better readme file. Included examples previously made in another repository

mathias-von-ottenbreit · mathias-von-ottenbreit · commit 98d79c542a0b · 2022-05-13T16:13:39.000+02:00
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,5 +1,5 @@
-recursive-include dependencies/pybind11 *.*
-recursive-include dependencies/eigen-master *.*
+recursive-include dependencies/pybind11 *
+recursive-include dependencies/eigen-master *
 recursive-include cpp *.h *.cpp
 include LICENSE
 include README.md
diff --git a/README.md b/README.md
@@ -1,11 +1,17 @@
-# aplr
-Automatic Piecewise Linear Regression
+# APLR
+Automatic Piecewise Linear Regression.
 
-# about
+# About
 Build interpretable parametric machine learning models in Python based on the Automatic Piecewise Linear Regression methodology developed by Mathias von Ottenbreit.
 
-# how to install
+# How to install
 pip install aplr
 
-# how to use
-See the two example Python scripts.
+# How to use
+Please see the two example Python scripts in the examples folder. They cover common use cases, but not all of the functionality in this package. For example, fitting with user-specified observation weights is possible, but the example scripts do not use this functionality.
+
+# Sponsorship
+Please sponsor Ottenbreit Data Science by clicking on the Sponsor button. Sufficient funding will enable maintenance of APLR and further development, such as developing a classifier based on APLR.
+
+# API reference
+A thorough API reference will be provided after Ottenbreit Data Science receives a total of $1000 in sponsorship funds. The reason is that writing it will take some time, and the work will not start until sufficient funding is available.
diff --git a/examples/train_aplr_cross_validation.py b/examples/train_aplr_cross_validation.py
@@ -0,0 +1,80 @@
+import pandas as pd
+import pickle
+from sklearn.model_selection import GridSearchCV, train_test_split
+from sklearn.datasets import load_diabetes
+from aplr import APLRRegressor
+
+
+#Settings: one seed shared by the train/test split and the APLR model
+random_state=0
+
+#Loading the scikit-learn diabetes dataset into a DataFrame, with the response stored in the "target" column
+diabetes = load_diabetes()
+data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
+data["target"] = pd.Series(diabetes.target)
+
+#Please note that APLR requires all columns in the data to be numerical.
+#Rows with missing values must therefore either be dropped or have the missing values imputed.
+#A categorical text variable must likewise be converted to numbers, for example to one dummy variable per category.
+
+#Randomly splitting data into training (70%) and test (30%) sets; the full frame is deleted once it has been split
+data_train, data_test = train_test_split(data, test_size=0.3, random_state=random_state)
+del data
+
+#Names of the predictor columns, the response column and the column that will hold the predictions
+predictors=diabetes.feature_names
+response="target"
+predicted="predicted"
+
+#Training model: 5-fold cross-validated grid search over 25 parameter combinations (1 x 5 x 5), scored by negative mean squared error
+param_grid = {"max_interactions":[100000],"max_interaction_level":[0,1,2,3,100],"min_observations_in_split":[1, 20, 50, 100, 200]}
+grid_search_cv = GridSearchCV(APLRRegressor(random_state=random_state,verbosity=1,m=1000,v=0.1),param_grid,cv=5,n_jobs=4,scoring="neg_mean_squared_error")
+grid_search_cv.fit(data_train[predictors].values,data_train[response].values)
+best_model:APLRRegressor = grid_search_cv.best_estimator_
+best_model.set_term_names(X_names=predictors)
+print("Done training")
+
+#Saving model
+pickle.dump(best_model,open("best_model.zip","wb"))
+
+#Cross-validation results of the grid search, best-ranked parameter combination first
+cv_results = pd.DataFrame(grid_search_cv.cv_results_).sort_values(by="rank_test_score")
+
+#Validation error at each boosting step while training the best model. APLR used the boosting step that gave the lowest validation error
+validation_error_per_boosting_step = best_model.get_validation_error_steps()
+
+#Names and coefficients of the terms in the best model
+terms=pd.DataFrame({"Predictor":best_model.get_term_names(),"Coefficient":best_model.get_term_coefficients()})
+
+#Coefficients per boosting step for the intercept and for the term at index 0
+intercept_coefficient_per_boosting_step = best_model.get_intercept_steps()
+first_predictor_coefficient_per_boosting_step = best_model.get_term_coefficient_steps(term_index=0)
+
+#Feature importance, estimated on the validation set when the best model was trained, sorted from most to least important
+estimated_feature_importance = pd.DataFrame({"predictor":predictors,"importance":best_model.get_feature_importance()})
+estimated_feature_importance = estimated_feature_importance.sort_values(by="importance", ascending=False)
+
+
+#PREDICTING AND TESTING ON THE TEST SET
+data_test[predicted]=best_model.predict(data_test[predictors].values)
+
+#Goodness of fit
+correlation=pd.DataFrame({"response":data_test[response],"prediction":data_test[predicted]}).corr()
+mse=((data_test[response]-data_test[predicted])**2).mean()
+mae=(data_test[response]-data_test[predicted]).abs().mean()
+goodness_of_fit=pd.DataFrame({"mse":[mse],"mae":[mae],"correlation":[correlation["prediction"][0]]})
+goodness_of_fit["r_squared"] = goodness_of_fit["correlation"]**2
+
+#Local feature importance for each prediction, per model term and per original predictor
+term_names_excluding_intercept = best_model.get_term_names()[1:]
+local_feature_importance_of_each_term = pd.DataFrame(
+    best_model.calculate_local_feature_importance_for_terms(data_test[predictors]),
+    columns = term_names_excluding_intercept
+)
+estimated_local_feature_importance_of_each_original_predictor = pd.DataFrame(
+    best_model.calculate_local_feature_importance(data_test[predictors]),
+    columns = predictors
+)
+
+#Values of each model term (intercept excluded) calculated on the test data
+calculated_terms = pd.DataFrame(best_model.calculate_terms(data_test[predictors]), columns = term_names_excluding_intercept)
diff --git a/examples/train_aplr_validation.py b/examples/train_aplr_validation.py
@@ -0,0 +1,90 @@
+import pandas as pd
+import numpy as np
+import pickle
+from sklearn.model_selection import ParameterGrid, train_test_split
+from sklearn.datasets import load_diabetes
+from aplr import APLRRegressor
+
+
+#Settings: one seed shared by the train/test split and the APLR models
+random_state=0
+
+#Loading the scikit-learn diabetes dataset into a DataFrame, with the response stored in the "target" column
+diabetes = load_diabetes()
+data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
+data["target"] = pd.Series(diabetes.target)
+
+#Please note that APLR requires all columns in the data to be numerical.
+#Rows with missing values must therefore either be dropped or have the missing values imputed.
+#A categorical text variable must likewise be converted to numbers, for example to one dummy variable per category.
+
+#Randomly splitting data into training (70%) and test (30%) sets; the full frame is deleted once it has been split
+data_train, data_test = train_test_split(data, test_size=0.3, random_state=random_state)
+del data
+
+#Names of the predictor columns, the response column and the column that will hold the predictions
+predictors=diabetes.feature_names
+response="target"
+predicted="predicted"
+
+#Training model
+validation_results=pd.DataFrame()
+best_validation_result=np.inf
+param_grid=ParameterGrid({"max_interactions":[100000],"max_interaction_level":[0,1,2,3,100],"min_observations_in_split":[1, 20, 50, 100, 200]})
+bestmodel=None
+for params in param_grid:
+    model = APLRRegressor(random_state=random_state,verbosity=3,m=1000,v=0.1,**params)
+    model.fit(data_train[predictors].values,data_train[response].values,X_names=predictors)
+    validation_error_for_this_model=np.min(model.get_validation_error_steps())
+    validation_results_for_this_model=pd.DataFrame(model.get_params(),index=[0])
+    validation_results_for_this_model["validation_error"]=validation_error_for_this_model
+    validation_results=pd.concat([validation_results,validation_results_for_this_model])
+    if(validation_error_for_this_model<best_validation_result):
+        best_validation_result=validation_error_for_this_model
+        best_model=model
+print("Done training")
+
+#Saving model
+pickle.dump(best_model,open("best_model.zip","wb"))
+
+#Validation results of the grid search, lowest validation error first
+validation_results = validation_results.sort_values(by="validation_error")
+
+#Validation error at each boosting step while training the best model. APLR used the boosting step that gave the lowest validation error
+validation_error_per_boosting_step = best_model.get_validation_error_steps()
+
+#Names and coefficients of the terms in the best model
+terms=pd.DataFrame({"Predictor":best_model.get_term_names(),"Coefficient":best_model.get_term_coefficients()})
+
+#Coefficients per boosting step for the intercept and for the term at index 0
+intercept_coefficient_per_boosting_step = best_model.get_intercept_steps()
+first_predictor_coefficient_per_boosting_step = best_model.get_term_coefficient_steps(term_index=0)
+
+#Feature importance, estimated on the validation set when the best model was trained, sorted from most to least important
+estimated_feature_importance = pd.DataFrame({"predictor":predictors,"importance":best_model.get_feature_importance()})
+estimated_feature_importance = estimated_feature_importance.sort_values(by="importance", ascending=False)
+
+
+#PREDICTING AND TESTING ON THE TEST SET
+data_test[predicted]=best_model.predict(data_test[predictors].values)
+
+#Goodness of fit
+correlation=pd.DataFrame({"response":data_test[response],"prediction":data_test[predicted]}).corr()
+mse=((data_test[response]-data_test[predicted])**2).mean()
+mae=(data_test[response]-data_test[predicted]).abs().mean()
+goodness_of_fit=pd.DataFrame({"mse":[mse],"mae":[mae],"correlation":[correlation["prediction"][0]]})
+goodness_of_fit["r_squared"] = goodness_of_fit["correlation"]**2
+
+#Local feature importance for each prediction, per model term and per original predictor
+term_names_excluding_intercept = best_model.get_term_names()[1:]
+local_feature_importance_of_each_term = pd.DataFrame(
+    best_model.calculate_local_feature_importance_for_terms(data_test[predictors]),
+    columns = term_names_excluding_intercept
+)
+estimated_local_feature_importance_of_each_original_predictor = pd.DataFrame(
+    best_model.calculate_local_feature_importance(data_test[predictors]),
+    columns = predictors
+)
+
+#Values of each model term (intercept excluded) calculated on the test data
+calculated_terms = pd.DataFrame(best_model.calculate_terms(data_test[predictors]), columns = term_names_excluding_intercept)