Commit 60bad15

[R-package][python-package] changes in demos
1 parent a459585 commit 60bad15

3 files changed: +92 additions, −55 deletions

R-package/demo/GPBoost_algorithm.R

Lines changed: 63 additions & 26 deletions
@@ -127,54 +127,92 @@ legend(legend=c("True F","Pred F"), "bottomright", bty="n", lwd=3, col=c(2,4))
 plot(b1, pred$random_effect_mean, xlab="truth", ylab="predicted",
      main="Comparison of true and predicted random effects")
 
+#--------------------Choosing tuning parameters using Bayesian optimization and the 'mlrMBO' R package ----------------
+library(mlrMBO)
+library(DiceKriging)
+library(rgenoud)
+source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/R_package_tune_pars_bayesian_optimization.R") # Load required function
+# Define search space
+# Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
+search_space <- list("learning_rate" = c(0.001, 10),
+                     "min_data_in_leaf" = c(1, 1000),
+                     "max_depth" = c(-1, -1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
+                     "num_leaves" = c(2, 2^10),
+                     "lambda_l2" = c(0, 100),
+                     "max_bin" = c(63, min(n,10000)),
+                     "line_search_step_length" = c(TRUE, FALSE))
+metric = "mse" # Define metric
+if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
+  metric = "binary_logloss"
+}
+# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
+gp_model <- GPModel(group_data = group, likelihood = likelihood)
+data_train <- gpb.Dataset(data = X, label = y)
+# Run parameter optimization using Bayesian optimization and k-fold CV
+crit = makeMBOInfillCritCB() # other criterion options: makeMBOInfillCritEI()
+opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
+                                              data = data_train, gp_model = gp_model,
+                                              nfold = 5, nrounds = 1000, early_stopping_rounds = 20,
+                                              metric = metric, crit = crit,
+                                              cv_seed = 4, verbose_eval = 1)
+print(paste0("Best parameters: ", paste0(unlist(lapply(seq_along(opt_params$best_params),
+             function(y, n, i) { paste0(n[[i]],": ", y[[i]]) }, y=opt_params$best_params,
+             n=names(opt_params$best_params))), collapse=", ")))
+print(paste0("Best number of iterations: ", opt_params$best_iter))
+print(paste0("Best score: ", round(opt_params$best_score, digits=3)))
+
+# Alternatively and faster: using manually defined validation data instead of cross-validation
+valid_tune_idx <- sample.int(length(y), as.integer(0.2*length(y))) # use 20% of the data as validation data
+folds <- list(valid_tune_idx)
+opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
+                                              data = data_train, gp_model = gp_model,
+                                              folds = folds, nrounds = 1000, early_stopping_rounds = 20,
+                                              metric = metric, crit = crit,
+                                              cv_seed = 4, verbose_eval = 1)
+
 #--------------------Choosing tuning parameters using random grid search----------------
 param_grid <- list("learning_rate" = c(0.001, 0.01, 0.1, 1, 10),
                    "min_data_in_leaf" = c(1, 10, 100, 1000),
-                   "max_depth" = c(-1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
+                   "max_depth" = c(-1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
                    "num_leaves" = 2^(1:10),
                    "lambda_l2" = c(0, 1, 10, 100),
                    "max_bin" = c(250, 500, 1000, min(n,10000)),
                    "line_search_step_length" = c(TRUE, FALSE))
-other_params <- list(verbose = 0) # avoid trace information when training models
-# Define metric
-metric = "mse"
+metric = "mse" # Define metric
 if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
   metric = "binary_logloss"
 }
 # Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
 gp_model <- GPModel(group_data = group, likelihood = likelihood)
 data_train <- gpb.Dataset(data = X, label = y)
 set.seed(1)
-opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid, params = other_params,
-                                              num_try_random = 100, nfold = 4,
+# Run parameter optimization using random grid search and k-fold CV
+# Note: deterministic grid search can be done by setting 'num_try_random=NULL'
+opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid,
                                               data = data_train, gp_model = gp_model,
-                                              use_gp_model_for_validation = TRUE, verbose_eval = 1,
+                                              num_try_random = 100, nfold = 5,
                                               nrounds = 1000, early_stopping_rounds = 20,
-                                              metric = metric)
-print(paste0("Best parameters: ",
-             paste0(unlist(lapply(seq_along(opt_params$best_params),
-                                  function(y, n, i) { paste0(n[[i]],": ", y[[i]]) },
-                                  y=opt_params$best_params,
+                                              verbose_eval = 1, metric = metric, cv_seed = 4)
+print(paste0("Best parameters: ", paste0(unlist(lapply(seq_along(opt_params$best_params),
+             function(y, n, i) { paste0(n[[i]],": ", y[[i]]) }, y=opt_params$best_params,
              n=names(opt_params$best_params))), collapse=", ")))
 print(paste0("Best number of iterations: ", opt_params$best_iter))
 print(paste0("Best score: ", round(opt_params$best_score, digits=3)))
 
 # Alternatively and faster: using manually defined validation data instead of cross-validation
 valid_tune_idx <- sample.int(length(y), as.integer(0.2*length(y))) # use 20% of the data as validation data
 folds <- list(valid_tune_idx)
-opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid, params = other_params,
-                                              num_try_random = 100, folds = folds,
-                                              data = dataset, gp_model = gp_model,
-                                              use_gp_model_for_validation = TRUE, verbose_eval = 1,
+opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid,
+                                              data = data_train, gp_model = gp_model,
+                                              num_try_random = 5, folds = folds,
                                               nrounds = 1000, early_stopping_rounds = 20,
-                                              metric = metric)
+                                              verbose_eval = 1, metric = metric, cv_seed = 4)
 
 #--------------------Cross-validation for determining number of iterations----------------
 gp_model <- GPModel(group_data = group, likelihood = likelihood)
 dataset <- gpb.Dataset(data = X, label = y)
-bst <- gpb.cv(data = dataset, gp_model = gp_model,
-              use_gp_model_for_validation = TRUE, params = params,
-              nrounds = 1000, nfold = 4, early_stopping_rounds = 20)
+bst <- gpb.cv(data = dataset, gp_model = gp_model, params = params,
+              nrounds = 1000, nfold = 5, early_stopping_rounds = 20)
 print(paste0("Optimal number of iterations: ", bst$best_iter))
 
 #--------------------Using a validation set for finding number of iterations----------------
#--------------------Using a validation set for finding number of iterations----------------
@@ -188,7 +226,7 @@ gp_model <- GPModel(group_data = group[train_ind], likelihood = likelihood)
 gp_model$set_prediction_data(group_data_pred = group[-train_ind])
 bst <- gpb.train(data = dtrain, gp_model = gp_model, nrounds = 1000,
                  params = params, verbose = 1, valids = valids,
-                 early_stopping_rounds = 20, use_gp_model_for_validation = TRUE)
+                 early_stopping_rounds = 20)
 print(paste0("Optimal number of iterations: ", bst$best_iter,
              ", best test error: ", bst$best_score))
 # Plot validation error
@@ -204,7 +242,7 @@ if (likelihood == "gaussian") {
   params_newton$learning_rate <- 0.1
   bst <- gpb.train(data = dtrain, gp_model = gp_model, nrounds = 1000,
                    params = params_newton, verbose = 1, valids = valids,
-                   early_stopping_rounds = 10, use_gp_model_for_validation = TRUE,
+                   early_stopping_rounds = 20,
                    leaves_newton_update = TRUE)
   print(paste0("Optimal number of iterations: ", bst$best_iter,
                ", best test error: ", bst$best_score))
@@ -245,7 +283,7 @@ shap_long <- shap.prep(bst, X_train = X)
 shap.plot.dependence(data_long = shap_long, x = "Covariate_1",
                      color_feature = "Covariate_2", smooth = FALSE)
 # SHAP interaction values
-source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/unify_gpboost_treeshap.R") # Load required function
+source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/R_package_unify_gpboost_treeshap.R") # Load required function
 library(treeshap)
 library(shapviz)
 unified_bst <- gpboost_unify_treeshap(bst, X)
@@ -286,9 +324,8 @@ gp_model <- GPModel(group_data = group, likelihood = likelihood)
 dataset <- gpb.Dataset(X, label = y)
 # Stage 1: run cross-validation to (i) determine the optimal number of iterations
 # and (ii) to estimate the GPModel on the out-of-sample data
-cvbst <- gpb.cv(data = dataset, gp_model = gp_model,
-                use_gp_model_for_validation = TRUE, params = params,
-                nrounds = 1000, nfold = 4, early_stopping_rounds = 5,
+cvbst <- gpb.cv(data = dataset, gp_model = gp_model, params = params,
+                nrounds = 1000, nfold = 5, early_stopping_rounds = 20,
                 fit_GP_cov_pars_OOS = TRUE, verbose = 0)
 print(paste0("Optimal number of iterations: ", cvbst$best_iter))
 # Fitted model (note: ideally, one would have to find the optimal combination of
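
Note (not part of this commit): the last hunk above switches the two-stage out-of-sample approach to 5-fold CV with early_stopping_rounds = 20. A minimal Python sketch of the same pattern, assuming the Python gpb.cv mirrors the R signature (including fit_GP_cov_pars_OOS):

import gpboost as gpb

# Hypothetical Python counterpart of the R two-stage fit above; 'X', 'y',
# 'group', 'likelihood', and 'params' are assumed to exist as in the demos
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
data_train = gpb.Dataset(data=X, label=y)
# CV determines the number of iterations and, with fit_GP_cov_pars_OOS=True,
# estimates the GPModel covariance parameters on out-of-sample data
cvbst = gpb.cv(params=params, train_set=data_train, gp_model=gp_model,
               num_boost_round=1000, nfold=5, early_stopping_rounds=20,
               fit_GP_cov_pars_OOS=True, seed=4)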

examples/python-guide/GPBoost_algorithm.py

Lines changed: 28 additions & 28 deletions
@@ -153,21 +153,22 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 # Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
 search_space = { 'learning_rate': [0.001, 10],
                  'min_data_in_leaf': [1, 1000],
-                 'max_depth': [-1,-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., 'max_depth': [-1,10]
+                 'max_depth': [-1,-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., 'max_depth': [-1,10]
                  'num_leaves': [2, 1024],
                  'lambda_l2': [0, 100],
                  'max_bin': [63, np.min([10000,n])],
                  'line_search_step_length': [True, False] }
-# Define metric
-metric = "mse"
+metric = "mse" # Define metric
 if likelihood in ("bernoulli_probit", "bernoulli_logit"):
     metric = "binary_logloss"
 # Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
 gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
-# Run parameter optimization using the TPE algorithm and 4-fold CV
-opt_params = gpb.tune_pars_TPE_algorithm_optuna(X=X, y=y, search_space=search_space,
-                                                nfold=4, gp_model=gp_model, metric=metric, tpe_seed=1,
-                                                max_num_boost_round=1000, n_trials=100, early_stopping_rounds=20)
+# Run parameter optimization using the TPE algorithm and k-fold CV
+opt_params = gpb.tune_pars_TPE_algorithm_optuna(search_space=search_space, n_trials=100,
+                                                X=X, y=y, gp_model=gp_model,
+                                                max_num_boost_round=1000, early_stopping_rounds=20,
+                                                nfold=5, metric=metric,
+                                                cv_seed=4, tpe_seed=1)
 print("Best parameters: " + str(opt_params['best_params']))
 print("Best number of iterations: " + str(opt_params['best_iter']))
 print("Best score: " + str(opt_params['best_score']))
@@ -178,35 +179,36 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 train_tune_idx = permute_aux[0:int(0.8 * n)] # use 20% of the data as validation data
 valid_tune_idx = permute_aux[int(0.8 * n):n]
 folds = [(train_tune_idx, valid_tune_idx)]
-opt_params = gpb.tune_pars_TPE_algorithm_optuna(X=X, y=y, search_space=search_space,
-                                                folds=folds, gp_model=gp_model, metric=metric, tpe_seed=1,
-                                                max_num_boost_round=1000, n_trials=100, early_stopping_rounds=20)
+opt_params = gpb.tune_pars_TPE_algorithm_optuna(search_space=search_space, n_trials=100,
+                                                X=X, y=y, gp_model=gp_model,
+                                                max_num_boost_round=1000, early_stopping_rounds=20,
+                                                folds=folds, metric=metric,
+                                                cv_seed=4, tpe_seed=1)
 
 #--------------------Choosing tuning parameters using random grid search----------------
 # Define parameter search grid
 # Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
 param_grid = { 'learning_rate': [0.001, 0.01, 0.1, 1, 10],
                'min_data_in_leaf': [1, 10, 100, 1000],
-               'max_depth': [-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., 'max_depth': [-1, 1, 2, 3, 5, 10]
+               'max_depth': [-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., 'max_depth': [-1, 1, 2, 3, 5, 10]
                'num_leaves': 2**np.arange(1,10),
                'lambda_l2': [0, 1, 10, 100],
                'max_bin': [250, 500, 1000, np.min([10000,n])],
               'line_search_step_length': [True, False]}
 other_params = {'verbose': 0} # avoid trace information when training models
-# Define metric
-metric = "mse"
+metric = "mse" # Define metric
 if likelihood in ("bernoulli_probit", "bernoulli_logit"):
     metric = "binary_logloss"
+# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
 gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
 data_train = gpb.Dataset(data=X, label=y)
-# Run parameter optimization using random grid search and 4-fold CV
+# Run parameter optimization using random grid search and k-fold CV
 # Note: deterministic grid search can be done by setting 'num_try_random=None'
 opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
-                                             num_try_random=100, nfold=4, seed=1000,
                                              train_set=data_train, gp_model=gp_model,
-                                             use_gp_model_for_validation=True, verbose_eval=1,
+                                             num_try_random=100, nfold=5,
                                              num_boost_round=1000, early_stopping_rounds=20,
-                                             metric=metric)
+                                             verbose_eval=1, metric=metric, seed=4)
 print("Best parameters: " + str(opt_params['best_params']))
 print("Best number of iterations: " + str(opt_params['best_iter']))
 print("Best score: " + str(opt_params['best_score']))
@@ -218,19 +220,18 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 valid_tune_idx = permute_aux[int(0.8 * n):n]
 folds = [(train_tune_idx, valid_tune_idx)]
 opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
-                                             num_try_random=100, folds=folds, seed=1000,
                                              train_set=data_train, gp_model=gp_model,
-                                             use_gp_model_for_validation=True, verbose_eval=1,
+                                             num_try_random=100, folds=folds,
                                              num_boost_round=1000, early_stopping_rounds=20,
-                                             metric=metric)
-
+                                             verbose_eval=1, metric=metric, seed=4)
+
+
 #--------------------Cross-validation for determining number of iterations----------------
 gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
 data_train = gpb.Dataset(data=X, label=y)
-cvbst = gpb.cv(params=params, train_set=data_train,
-               gp_model=gp_model, use_gp_model_for_validation=True,
+cvbst = gpb.cv(params=params, train_set=data_train, gp_model=gp_model,
               num_boost_round=1000, early_stopping_rounds=20,
-               nfold=4, verbose_eval=True, show_stdv=False, seed=1)
+               nfold=5, verbose_eval=True, show_stdv=False, seed=1)
 metric_name = list(cvbst.keys())[0]
 print("Best number of iterations: " + str(np.argmin(cvbst[metric_name]) + 1))
 
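Note (not part of this commit): gpb.cv returns a dict mapping a metric key to per-iteration mean CV scores (the exact key name depends on the chosen metric), which is why the demo reads off the best iteration with argmin; a minimal sketch:

import numpy as np

metric_name = list(cvbst.keys())[0]                 # e.g. a '<metric>-mean' key
best_iter = int(np.argmin(cvbst[metric_name])) + 1  # iterations are 1-based
best_score = float(np.min(cvbst[metric_name]))
print("Best iteration %d with mean CV score %.4f" % (best_iter, best_score))
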
@@ -246,8 +247,7 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 evals_result = {} # record eval results for plotting
 bst = gpb.train(params=params, train_set=data_train, num_boost_round=1000,
                 gp_model=gp_model, valid_sets=data_eval,
-                early_stopping_rounds=20, use_gp_model_for_validation=True,
-                evals_result=evals_result)
+                early_stopping_rounds=20, evals_result=evals_result)
 gpb.plot_metric(evals_result, figsize=(10, 5)) # plot validation scores
 plt.show(block=False)
 
@@ -258,8 +258,8 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 params_newton['learning_rate'] = 0.1
 evals_result = {} # record eval results for plotting
 bst = gpb.train(params=params_newton, train_set=data_train, num_boost_round=1000,
-                gp_model=gp_model, valid_sets=data_eval, early_stopping_rounds=5,
-                use_gp_model_for_validation=True, evals_result=evals_result)
+                gp_model=gp_model, valid_sets=data_eval, early_stopping_rounds=20,
+                evals_result=evals_result)
 gpb.plot_metric(evals_result, figsize=(10, 5)) # plot validation scores
 
 #--------------------Model interpretation----------------

examples/python-guide/generalized_linear_Gaussian_process_mixed_effects_models.py

Lines changed: 1 addition & 1 deletion
@@ -419,7 +419,7 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 # -> covariance parameter estimates can have high variance
 
 # Predict latent GP at training data locations (=smoothing)
-GP_smooth = gp_model.predict_training_data_random_effects(predict_var = True) # predict_var = True gives uncertainty for random effect predictions
+GP_smooth = gp_model.predict_training_data_random_effects(predict_var = False) # predict_var = True gives uncertainty for random effect predictions
 # Compare true and predicted random effects
 plt.scatter(b_train, GP_smooth['GP'], label="Intercept GP", alpha=0.5)
 plt.scatter(b2, GP_smooth['GP_rand_coef_nb_1'], label="1. random coef. GP", alpha=0.5)
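
Note (not part of this commit): with predict_var = True, predict_training_data_random_effects additionally returns prediction variances; a minimal sketch, where the exact columns of the result are an assumption rather than something this diff confirms:

# Request variances along with the predicted (posterior mean) random effects
GP_smooth_var = gp_model.predict_training_data_random_effects(predict_var=True)
print(GP_smooth_var.columns)  # inspect which mean / variance columns are returned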
