"""
Bayesian hyperparameter optimization [https://github.com/fmfn/BayesianOptimization]
for the Mean Absolute Error objective
on the default features of https://www.kaggle.com/c/allstate-claims-severity
"""
__author__ = "Vladimir Iglovikov"

import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from bayes_opt import BayesianOptimization
from tqdm import tqdm


def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):
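    # bayes_opt proposes unconstrained floats, so cast the integer-valued
    # parameters and clamp the remaining ones into XGBoost's valid ranges.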
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)

    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                       seed=random_state,
                       early_stopping_rounds=50)
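    # BayesianOptimization maximizes its objective, so return the negative
    # cross-validated MAE; the last row is the best early-stopped iteration.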
    return -cv_result['test-mae-mean'].values[-1]


def prepare_data():
    train = pd.read_csv('../input/train.csv')
    categorical_columns = train.select_dtypes(include=['object']).columns
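    # Label-encode every categorical (object-typed) column in place so the
    # otherwise string-valued features become numeric.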
    for column in tqdm(categorical_columns):
        le = LabelEncoder()
        train[column] = le.fit_transform(train[column])
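    # 'loss' is the regression target; drop it and the row id from the features.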
    y = train['loss']
    X = train.drop(['loss', 'id'], axis=1)
    xgtrain = xgb.DMatrix(X, label=y)

    return xgtrain


if __name__ == '__main__':
    xgtrain = prepare_data()
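    # Base settings shared by every cross-validation run; xgb_evaluate fills
    # in the tuned parameters on top of this dict via module scope.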
    num_rounds = 3000
    random_state = 2016
    num_iter = 25
    init_points = 5
    params = {
        'eta': 0.1,
        'silent': 1,
        'eval_metric': 'mae',
        'verbose_eval': True,
        'seed': random_state
    }
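    # Search bounds for each hyperparameter explored by the optimizer.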
    xgbBO = BayesianOptimization(xgb_evaluate, {'min_child_weight': (1, 20),
                                                'colsample_bytree': (0.5, 1),
                                                'max_depth': (5, 15),
                                                'subsample': (0.5, 1),
                                                'gamma': (0, 10),
                                                'alpha': (0, 10),
                                                })

    xgbBO.maximize(init_points=init_points, n_iter=num_iter)
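    # A minimal sketch of reading out the best result, assuming a recent
    # bayes_opt release that exposes a `.max` property (older fmfn versions
    # stored it under `.res['max']` instead).
    print(xgbBO.max)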