Commit 60bad15

[R-package][python-package] changes in demos
1 parent a459585 commit 60bad15

3 files changed: +92 additions, −55 deletions

R-package/demo/GPBoost_algorithm.R

Lines changed: 63 additions & 26 deletions
@@ -127,54 +127,92 @@ legend(legend=c("True F","Pred F"), "bottomright", bty="n", lwd=3, col=c(2,4))
 plot(b1, pred$random_effect_mean, xlab="truth", ylab="predicted",
      main="Comparison of true and predicted random effects")
 
+#--------------------Choosing tuning parameters using Bayesian optimization and the 'mlrMBO' R package ----------------
+library(mlrMBO)
+library(DiceKriging)
+library(rgenoud)
+source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/R_package_tune_pars_bayesian_optimization.R") # Load required function
+# Define search space
+# Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
+search_space <- list("learning_rate" = c(0.001, 10),
+                     "min_data_in_leaf" = c(1, 1000),
+                     "max_depth" = c(-1, -1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
+                     "num_leaves" = c(2, 2^10),
+                     "lambda_l2" = c(0, 100),
+                     "max_bin" = c(63, min(n,10000)),
+                     "line_search_step_length" = c(TRUE, FALSE))
+metric = "mse" # Define metric
+if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
+  metric = "binary_logloss"
+}
+# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
+gp_model <- GPModel(group_data = group, likelihood = likelihood)
+data_train <- gpb.Dataset(data = X, label = y)
+# Run parameter optimization using Bayesian optimization and k-fold CV
+crit = makeMBOInfillCritCB() # other criterion options: makeMBOInfillCritEI()
+opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
+                                              data = data_train, gp_model = gp_model,
+                                              nfold = 5, nrounds = 1000, early_stopping_rounds = 20,
+                                              metric = metric, crit = crit,
+                                              cv_seed = 4, verbose_eval = 1)
+print(paste0("Best parameters: ", paste0(unlist(lapply(seq_along(opt_params$best_params),
+             function(y, n, i) { paste0(n[[i]],": ", y[[i]]) }, y=opt_params$best_params,
+             n=names(opt_params$best_params))), collapse=", ")))
+print(paste0("Best number of iterations: ", opt_params$best_iter))
+print(paste0("Best score: ", round(opt_params$best_score, digits=3)))
+
+# Alternatively and faster: using manually defined validation data instead of cross-validation
+valid_tune_idx <- sample.int(length(y), as.integer(0.2*length(y))) # use 20% of the data as validation data
+folds <- list(valid_tune_idx)
+opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
+                                              data = data_train, gp_model = gp_model,
+                                              folds = folds, nrounds = 1000, early_stopping_rounds = 20,
+                                              metric = metric, crit = crit,
+                                              cv_seed = 4, verbose_eval = 1)
+
 #--------------------Choosing tuning parameters using random grid search----------------
 param_grid <- list("learning_rate" = c(0.001, 0.01, 0.1, 1, 10),
                    "min_data_in_leaf" = c(1, 10, 100, 1000),
-                   "max_depth" = c(-1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
+                   "max_depth" = c(-1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
                    "num_leaves" = 2^(1:10),
                    "lambda_l2" = c(0, 1, 10, 100),
                    "max_bin" = c(250, 500, 1000, min(n,10000)),
                    "line_search_step_length" = c(TRUE, FALSE))
-other_params <- list(verbose = 0) # avoid trace information when training models
-# Define metric
-metric = "mse"
+metric = "mse" # Define metric
 if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
   metric = "binary_logloss"
 }
 # Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
 gp_model <- GPModel(group_data = group, likelihood = likelihood)
 data_train <- gpb.Dataset(data = X, label = y)
 set.seed(1)
-opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid, params = other_params,
-                                              num_try_random = 100, nfold = 4,
+# Run parameter optimization using random grid search and k-fold CV
+# Note: deterministic grid search can be done by setting 'num_try_random=NULL'
+opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid,
                                               data = data_train, gp_model = gp_model,
-                                              use_gp_model_for_validation = TRUE, verbose_eval = 1,
+                                              num_try_random = 100, nfold = 5,
                                               nrounds = 1000, early_stopping_rounds = 20,
-                                              metric = metric)
-print(paste0("Best parameters: ",
-             paste0(unlist(lapply(seq_along(opt_params$best_params),
-                                  function(y, n, i) { paste0(n[[i]],": ", y[[i]]) },
-                                  y=opt_params$best_params,
+                                              verbose_eval = 1, metric = metric, cv_seed = 4)
+print(paste0("Best parameters: ", paste0(unlist(lapply(seq_along(opt_params$best_params),
+             function(y, n, i) { paste0(n[[i]],": ", y[[i]]) }, y=opt_params$best_params,
              n=names(opt_params$best_params))), collapse=", ")))
 print(paste0("Best number of iterations: ", opt_params$best_iter))
 print(paste0("Best score: ", round(opt_params$best_score, digits=3)))
 
 # Alternatively and faster: using manually defined validation data instead of cross-validation
 valid_tune_idx <- sample.int(length(y), as.integer(0.2*length(y))) # use 20% of the data as validation data
 folds <- list(valid_tune_idx)
-opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid, params = other_params,
-                                              num_try_random = 100, folds = folds,
-                                              data = dataset, gp_model = gp_model,
-                                              use_gp_model_for_validation = TRUE, verbose_eval = 1,
+opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid,
+                                              data = data_train, gp_model = gp_model,
+                                              num_try_random = 5, folds = folds,
                                               nrounds = 1000, early_stopping_rounds = 20,
-                                              metric = metric)
+                                              verbose_eval = 1, metric = metric, cv_seed = 4)
 
 #--------------------Cross-validation for determining number of iterations----------------
 gp_model <- GPModel(group_data = group, likelihood = likelihood)
 dataset <- gpb.Dataset(data = X, label = y)
-bst <- gpb.cv(data = dataset, gp_model = gp_model,
-              use_gp_model_for_validation = TRUE, params = params,
-              nrounds = 1000, nfold = 4, early_stopping_rounds = 20)
+bst <- gpb.cv(data = dataset, gp_model = gp_model, params = params,
+              nrounds = 1000, nfold = 5, early_stopping_rounds = 20)
 print(paste0("Optimal number of iterations: ", bst$best_iter))
 
 #--------------------Using a validation set for finding number of iterations----------------
#--------------------Using a validation set for finding number of iterations----------------
@@ -188,7 +226,7 @@ gp_model <- GPModel(group_data = group[train_ind], likelihood = likelihood)
 gp_model$set_prediction_data(group_data_pred = group[-train_ind])
 bst <- gpb.train(data = dtrain, gp_model = gp_model, nrounds = 1000,
                  params = params, verbose = 1, valids = valids,
-                 early_stopping_rounds = 20, use_gp_model_for_validation = TRUE)
+                 early_stopping_rounds = 20)
 print(paste0("Optimal number of iterations: ", bst$best_iter,
              ", best test error: ", bst$best_score))
 # Plot validation error
@@ -204,7 +242,7 @@ if (likelihood == "gaussian") {
   params_newton$learning_rate <- 0.1
   bst <- gpb.train(data = dtrain, gp_model = gp_model, nrounds = 1000,
                    params = params_newton, verbose = 1, valids = valids,
-                   early_stopping_rounds = 10, use_gp_model_for_validation = TRUE,
+                   early_stopping_rounds = 20,
                    leaves_newton_update = TRUE)
   print(paste0("Optimal number of iterations: ", bst$best_iter,
                ", best test error: ", bst$best_score))
@@ -245,7 +283,7 @@ shap_long <- shap.prep(bst, X_train = X)
 shap.plot.dependence(data_long = shap_long, x = "Covariate_1",
                      color_feature = "Covariate_2", smooth = FALSE)
 # SHAP interaction values
-source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/unify_gpboost_treeshap.R") # Load required function
+source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/R_package_unify_gpboost_treeshap.R") # Load required function
 library(treeshap)
 library(shapviz)
 unified_bst <- gpboost_unify_treeshap(bst, X)
@@ -286,9 +324,8 @@ gp_model <- GPModel(group_data = group, likelihood = likelihood)
 dataset <- gpb.Dataset(X, label = y)
 # Stage 1: run cross-validation to (i) determine the optimal number of iterations
 # and (ii) to estimate the GPModel on the out-of-sample data
-cvbst <- gpb.cv(data = dataset, gp_model = gp_model,
-                use_gp_model_for_validation = TRUE, params = params,
-                nrounds = 1000, nfold = 4, early_stopping_rounds = 5,
+cvbst <- gpb.cv(data = dataset, gp_model = gp_model, params = params,
+                nrounds = 1000, nfold = 5, early_stopping_rounds = 20,
                 fit_GP_cov_pars_OOS = TRUE, verbose = 0)
 print(paste0("Optimal number of iterations: ", cvbst$best_iter))
 # Fitted model (note: ideally, one would have to find the optimal combination of
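
Note (not part of this commit): the last hunk above switches the two-stage out-of-sample approach to 5-fold CV with early_stopping_rounds = 20. A minimal Python sketch of the same pattern, assuming the Python gpb.cv mirrors the R signature (including fit_GP_cov_pars_OOS):

import gpboost as gpb

# Hypothetical Python counterpart of the R two-stage fit above; 'X', 'y',
# 'group', 'likelihood', and 'params' are assumed to exist as in the demos
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
data_train = gpb.Dataset(data=X, label=y)
# CV determines the number of iterations and, with fit_GP_cov_pars_OOS=True,
# estimates the GPModel covariance parameters on out-of-sample data
cvbst = gpb.cv(params=params, train_set=data_train, gp_model=gp_model,
               num_boost_round=1000, nfold=5, early_stopping_rounds=20,
               fit_GP_cov_pars_OOS=True, seed=4)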

examples/python-guide/GPBoost_algorithm.py

Lines changed: 28 additions & 28 deletions
@@ -153,21 +153,22 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 # Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
 search_space = { 'learning_rate': [0.001, 10],
                  'min_data_in_leaf': [1, 1000],
-                 'max_depth': [-1,-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., 'max_depth': [-1,10]
+                 'max_depth': [-1,-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., 'max_depth': [-1,10]
                  'num_leaves': [2, 1024],
                  'lambda_l2': [0, 100],
                  'max_bin': [63, np.min([10000,n])],
                  'line_search_step_length': [True, False] }
-# Define metric
-metric = "mse"
+metric = "mse" # Define metric
 if likelihood in ("bernoulli_probit", "bernoulli_logit"):
     metric = "binary_logloss"
 # Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
 gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
-# Run parameter optimization using the TPE algorithm and 4-fold CV
-opt_params = gpb.tune_pars_TPE_algorithm_optuna(X=X, y=y, search_space=search_space,
-                                                nfold=4, gp_model=gp_model, metric=metric, tpe_seed=1,
-                                                max_num_boost_round=1000, n_trials=100, early_stopping_rounds=20)
+# Run parameter optimization using the TPE algorithm and k-fold CV
+opt_params = gpb.tune_pars_TPE_algorithm_optuna(search_space=search_space, n_trials=100,
+                                                X=X, y=y, gp_model=gp_model,
+                                                max_num_boost_round=1000, early_stopping_rounds=20,
+                                                nfold=5, metric=metric,
+                                                cv_seed=4, tpe_seed=1)
 print("Best parameters: " + str(opt_params['best_params']))
 print("Best number of iterations: " + str(opt_params['best_iter']))
 print("Best score: " + str(opt_params['best_score']))
@@ -178,35 +179,36 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 train_tune_idx = permute_aux[0:int(0.8 * n)] # use 20% of the data as validation data
 valid_tune_idx = permute_aux[int(0.8 * n):n]
 folds = [(train_tune_idx, valid_tune_idx)]
-opt_params = gpb.tune_pars_TPE_algorithm_optuna(X=X, y=y, search_space=search_space,
-                                                folds=folds, gp_model=gp_model, metric=metric, tpe_seed=1,
-                                                max_num_boost_round=1000, n_trials=100, early_stopping_rounds=20)
+opt_params = gpb.tune_pars_TPE_algorithm_optuna(search_space=search_space, n_trials=100,
+                                                X=X, y=y, gp_model=gp_model,
+                                                max_num_boost_round=1000, early_stopping_rounds=20,
+                                                folds=folds, metric=metric,
+                                                cv_seed=4, tpe_seed=1)
 
 #--------------------Choosing tuning parameters using random grid search----------------
 # Define parameter search grid
 # Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
 param_grid = { 'learning_rate': [0.001, 0.01, 0.1, 1, 10],
                'min_data_in_leaf': [1, 10, 100, 1000],
-               'max_depth': [-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., 'max_depth': [-1, 1, 2, 3, 5, 10]
+               'max_depth': [-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., 'max_depth': [-1, 1, 2, 3, 5, 10]
                'num_leaves': 2**np.arange(1,10),
                'lambda_l2': [0, 1, 10, 100],
                'max_bin': [250, 500, 1000, np.min([10000,n])],
               'line_search_step_length': [True, False]}
 other_params = {'verbose': 0} # avoid trace information when training models
-# Define metric
-metric = "mse"
+metric = "mse" # Define metric
 if likelihood in ("bernoulli_probit", "bernoulli_logit"):
     metric = "binary_logloss"
+# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
 gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
 data_train = gpb.Dataset(data=X, label=y)
-# Run parameter optimization using random grid search and 4-fold CV
+# Run parameter optimization using random grid search and k-fold CV
 # Note: deterministic grid search can be done by setting 'num_try_random=None'
 opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
-                                             num_try_random=100, nfold=4, seed=1000,
                                              train_set=data_train, gp_model=gp_model,
-                                             use_gp_model_for_validation=True, verbose_eval=1,
+                                             num_try_random=100, nfold=5,
                                              num_boost_round=1000, early_stopping_rounds=20,
-                                             metric=metric)
+                                             verbose_eval=1, metric=metric, seed=4)
 print("Best parameters: " + str(opt_params['best_params']))
 print("Best number of iterations: " + str(opt_params['best_iter']))
 print("Best score: " + str(opt_params['best_score']))
@@ -218,19 +220,18 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 valid_tune_idx = permute_aux[int(0.8 * n):n]
 folds = [(train_tune_idx, valid_tune_idx)]
 opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
-                                             num_try_random=100, folds=folds, seed=1000,
                                              train_set=data_train, gp_model=gp_model,
-                                             use_gp_model_for_validation=True, verbose_eval=1,
+                                             num_try_random=100, folds=folds,
                                              num_boost_round=1000, early_stopping_rounds=20,
-                                             metric=metric)
-
+                                             verbose_eval=1, metric=metric, seed=4)
+
+
 #--------------------Cross-validation for determining number of iterations----------------
 gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
 data_train = gpb.Dataset(data=X, label=y)
-cvbst = gpb.cv(params=params, train_set=data_train,
-               gp_model=gp_model, use_gp_model_for_validation=True,
+cvbst = gpb.cv(params=params, train_set=data_train, gp_model=gp_model,
               num_boost_round=1000, early_stopping_rounds=20,
-               nfold=4, verbose_eval=True, show_stdv=False, seed=1)
+               nfold=5, verbose_eval=True, show_stdv=False, seed=1)
 metric_name = list(cvbst.keys())[0]
 print("Best number of iterations: " + str(np.argmin(cvbst[metric_name]) + 1))
 
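Note (not part of this commit): gpb.cv returns a dict mapping a metric key to per-iteration mean CV scores (the exact key name depends on the chosen metric), which is why the demo reads off the best iteration with argmin; a minimal sketch:

import numpy as np

metric_name = list(cvbst.keys())[0]                 # e.g. a '<metric>-mean' key
best_iter = int(np.argmin(cvbst[metric_name])) + 1  # iterations are 1-based
best_score = float(np.min(cvbst[metric_name]))
print("Best iteration %d with mean CV score %.4f" % (best_iter, best_score))
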
@@ -246,8 +247,7 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 evals_result = {} # record eval results for plotting
 bst = gpb.train(params=params, train_set=data_train, num_boost_round=1000,
                 gp_model=gp_model, valid_sets=data_eval,
-                early_stopping_rounds=20, use_gp_model_for_validation=True,
-                evals_result=evals_result)
+                early_stopping_rounds=20, evals_result=evals_result)
 gpb.plot_metric(evals_result, figsize=(10, 5)) # plot validation scores
 plt.show(block=False)
 
@@ -258,8 +258,8 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 params_newton['learning_rate'] = 0.1
 evals_result = {} # record eval results for plotting
 bst = gpb.train(params=params_newton, train_set=data_train, num_boost_round=1000,
-                gp_model=gp_model, valid_sets=data_eval, early_stopping_rounds=5,
-                use_gp_model_for_validation=True, evals_result=evals_result)
+                gp_model=gp_model, valid_sets=data_eval, early_stopping_rounds=20,
+                evals_result=evals_result)
 gpb.plot_metric(evals_result, figsize=(10, 5)) # plot validation scores
 
 #--------------------Model interpretation----------------

examples/python-guide/generalized_linear_Gaussian_process_mixed_effects_models.py

Lines changed: 1 addition & 1 deletion
@@ -419,7 +419,7 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 # -> covariance parameter estimates can have high variance
 
 # Predict latent GP at training data locations (=smoothing)
-GP_smooth = gp_model.predict_training_data_random_effects(predict_var = True) # predict_var = True gives uncertainty for random effect predictions
+GP_smooth = gp_model.predict_training_data_random_effects(predict_var = False) # predict_var = True gives uncertainty for random effect predictions
 # Compare true and predicted random effects
 plt.scatter(b_train, GP_smooth['GP'], label="Intercept GP", alpha=0.5)
 plt.scatter(b2, GP_smooth['GP_rand_coef_nb_1'], label="1. random coef. GP", alpha=0.5)
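
Note (not part of this commit): with predict_var = True, predict_training_data_random_effects additionally returns prediction variances; a minimal sketch, where the exact columns of the result are an assumption rather than something this diff confirms:

# Request variances along with the predicted (posterior mean) random effects
GP_smooth_var = gp_model.predict_training_data_random_effects(predict_var=True)
print(GP_smooth_var.columns)  # inspect which mean / variance columns are returned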
