Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 46 additions & 8 deletions R/Lrnr_ranger.R
Original file line number Diff line number Diff line change
Expand Up @@ -118,28 +118,66 @@ Lrnr_ranger <- R6Class(
if (task$has_node("weights")) {
args$case.weights <- task$weights
}
data_in <- cbind(task$Y, task$X)
# Preserve raw data frame
Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
data_in <- cbind(task$Y, Xdf)
colnames(data_in)[1] <- task$nodes$outcome
args$data <- data_in
args$dependent.variable.name <- task$nodes$outcome
args$probability <- task$outcome_type$type == "categorical"
# For binomial and categorical tasks, downstream sl3 expects probabilities.
if (is.null(args$probability)) {
args$probability <- task$outcome_type$type %in% c("binomial", "categorical")
}
fit_object <- call_with_args(ranger::ranger, args)
return(fit_object)
},
.predict = function(task) {

# Preserve raw data frame
Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)

# extract numeric predictions from custom class ranger.prediction
predictions <- stats::predict(
pred_obj <- stats::predict(
private$.fit_object,
data = task$X,
data = Xdf,
type = "response",
num.threads = self$params$num.threads
)
pred_raw <- pred_obj$predictions

predictions <- predictions[[1]]
outcome_type <- private$.training_outcome_type$type
if (outcome_type == "categorical") {
# pack K-class probabilities in a single column
predictions <- pack_predictions(pred_raw)
} else if (outcome_type == "binomial") {
# Return P(Y = max level), consistent with Variable_Type$format
# and other binomial learners in sl3.
if (is.matrix(pred_raw)) {
levs <- private$.training_outcome_type$levels
target <- if (!is.null(levs) && length(levs) > 0) {
as.character(levs[[length(levs)]])
} else {
colnames(pred_raw)[ncol(pred_raw)]
}

if (private$.training_outcome_type$type == "categorical") {
# pack predictions in a single column
predictions <- pack_predictions(predictions)
if (!is.null(colnames(pred_raw)) && target %in% colnames(pred_raw)) {
predictions <- as.numeric(pred_raw[, target])
} else {
predictions <- as.numeric(pred_raw[, ncol(pred_raw)])
}
} else if (is.factor(pred_raw) || is.character(pred_raw)) {
levs <- private$.training_outcome_type$levels
target <- if (!is.null(levs) && length(levs) > 0) {
as.character(levs[[length(levs)]])
} else {
as.character(sort(unique(pred_raw))[length(unique(pred_raw))])
}
predictions <- as.numeric(as.character(pred_raw) == target)
} else {
predictions <- as.numeric(pred_raw)
}
} else {
predictions <- as.numeric(pred_raw)
}
return(predictions)
},
Expand Down
180 changes: 91 additions & 89 deletions R/Lrnr_xgboost.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,121 +80,123 @@ Lrnr_xgboost <- R6Class(
"offset", "importance"
),
.train = function(task) {
# Safe helper for %||%
`%||%` <- function(a, b) if (!is.null(a)) a else b

args <- self$params


# verbosity
verbose <- args$verbose
if (is.null(verbose)) {
verbose <- getOption("sl3.verbose")
}
if (is.null(verbose)) verbose <- getOption("sl3.verbose")
args$verbose <- as.integer(verbose)

# set up outcome
# outcome
outcome_type <- self$get_outcome_type(task)
Y <- outcome_type$format(task$Y)
if (outcome_type$type == "categorical") {
Y <- as.numeric(Y) - 1
if (outcome_type$type == "categorical") Y <- as.numeric(Y) - 1L

# raw covariates, keep factors intact
Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
Comment on lines +98 to +99
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The existing tests in test-xgboost.R compare predictions with the native xgboost library using as.matrix(task$X). However, with expand_factors=FALSE, the new code works with raw data frames containing factors. This will cause the existing tests to fail because the test comparisons still use the matrix-based approach while the wrapper now uses data frames with factors.

Copilot uses AI. Check for mistakes.

# (optional but recommended) explicit feature types
feat_types <- vapply(Xdf, function(z) {
if (is.factor(z)) "c" else if (is.integer(z)) "int"
else if (is.logical(z)) "i" else "float"
}, character(1))
Comment on lines +102 to +105
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The feature_types logic is incomplete. When a column is logical, it returns "i", but there's a missing 'else' clause at the end. This will cause an error because the vapply expects a character(1) result for all branches, but if none of the conditions match, no value is returned.

Copilot uses AI. Check for mistakes.

# DMatrix
dtrain <- try(xgboost::xgb.DMatrix(
data = Xdf, label = Y,
feature_names = colnames(Xdf),
feature_types = feat_types
), silent = TRUE)

if (!inherits(dtrain, "xgb.DMatrix")) {
cls <- vapply(Xdf, function(z) paste(class(z), collapse=","), character(1))
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space after the comma in the parameter "collapse=",". Should be "collapse = ","" to follow R coding conventions.

Suggested change
cls <- vapply(Xdf, function(z) paste(class(z), collapse=","), character(1))
cls <- vapply(Xdf, function(z) paste(class(z), collapse = ","), character(1))

Copilot uses AI. Check for mistakes.
stop("xgb.DMatrix construction failed. Column classes: ",
paste(sprintf("%s:[%s]", names(cls), cls), collapse="; "))
}

# set up predictor data
Xmat <- as.matrix(task$X)
if (is.integer(Xmat)) {
Xmat[, 1] <- as.numeric(Xmat[, 1])
}
if (nrow(Xmat) != nrow(task$X) & ncol(Xmat) == nrow(task$X)) {
Xmat <- t(Xmat)
}
args$data <- try(xgboost::xgb.DMatrix(Xmat, label = Y), silent = TRUE)

# specify weights

# weights
if (task$has_node("weights")) {
try(xgboost::setinfo(args$data, "weight", task$weights), silent = TRUE)
xgboost::setinfo(dtrain, "weight", task$weights)
}

# specify offset

# offset (base_margin)
link_fun <- NULL
if (task$has_node("offset")) {
if (outcome_type$type == "categorical") {
# TODO: fix
stop("offsets not yet supported for outcome_type='categorical'")
}
family <- outcome_type$glm_family(return_object = TRUE)
link_fun <- args$family$linkfun
link_fun <- family$linkfun
offset <- task$offset_transformed(link_fun)
try(xgboost::setinfo(args$data, "base_margin", offset), silent = TRUE)
} else {
link_fun <- NULL
xgboost::setinfo(dtrain, "base_margin", offset)
}

# specify objective if it's NULL to avoid xgb warnings
if (is.null(args$objective)) {
if (outcome_type$type == "binomial") {
args$objective <- "binary:logistic"
args$eval_metric <- "logloss"

# ----- xgboost arguments: use params + evals -----
nrounds <- if (!is.null(args$nrounds)) args$nrounds else 20L
params <- if (!is.null(args$params)) args$params else list()
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parameter extraction from args is problematic. The code extracts nrounds and params separately from args, but this assumes users will pass a nested params argument. However, based on the documentation and initialize method, users pass parameters directly (e.g., nrounds=20, nthread=1, ...). The old code used call_with_args which handled this properly. The new approach should extract nrounds from args$nrounds, but other xgboost parameters should be collected into params from args (excluding nrounds, verbose, and other sl3-specific parameters).

Suggested change
params <- if (!is.null(args$params)) args$params else list()
# Collect xgboost params from args, excluding sl3-specific ones
sl3_specific <- c("nrounds", "verbose", "params")
params <- args[setdiff(names(args), sl3_specific)]

Copilot uses AI. Check for mistakes.

# set objective/metric if not provided
if (is.null(params$objective)) {
if (outcome_type$type %in% c("binomial")) {
params$objective <- "binary:logistic"
params$eval_metric <- params$eval_metric %||% "logloss"
} else if (outcome_type$type == "quasibinomial") {
args$objective <- "reg:logistic"
params$objective <- "reg:logistic"
} else if (outcome_type$type == "categorical") {
args$objective <- "multi:softprob"
args$eval_metric <- "mlogloss"
args$num_class <- as.integer(length(outcome_type$levels))
params$objective <- "multi:softprob"
params$eval_metric <- params$eval_metric %||% "mlogloss"
params$num_class <- as.integer(length(outcome_type$levels))
} else {
params$objective <- params$objective %||% "reg:squarederror"
}
}

args$watchlist <- list(train = args$data)
fit_object <- call_with_args(xgboost::xgb.train, args,
keep_all = TRUE,
ignore = "formula"

fit_booster <- xgboost::xgb.train(
data = dtrain,
nrounds = nrounds,
params = params,
evals = list(train = dtrain),
verbose = args$verbose
)
fit_object$training_offset <- task$has_node("offset")
fit_object$link_fun <- link_fun


# DO NOT mutate the booster; wrap it instead
fit_object <- list(
booster = fit_booster,
meta = list(
training_offset = task$has_node("offset"),
link_fun = link_fun
)
)
class(fit_object) <- c("sl3_xgb_fit", "list")
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The custom class "sl3_xgb_fit" is added to the fit_object wrapper, but there's no documentation or explanation of why this custom class is needed or how it should be used. If this is meant to be an internal implementation detail, consider documenting it. If external code might need to handle this class, consider adding S3 methods or documentation.

Copilot uses AI. Check for mistakes.
Comment on lines +166 to +173
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The importance() method expects the fit object to be an xgb.Booster directly (line 69), but the new code returns a custom list wrapper with the booster nested inside. This will break the importance() method. The args$model should be set to fit_object$booster instead of fit_object.

Copilot uses AI. Check for mistakes.

return(fit_object)
},
.predict = function(task = NULL) {
fit_object <- private$.fit_object

# set up test data for prediction
Xmat <- as.matrix(task$X)
if (is.integer(Xmat)) {
Xmat[, 1] <- as.numeric(Xmat[, 1])
}
# order of columns has to be the same in xgboost training and test data
Xmat_ord <- as.matrix(Xmat[, match(fit_object$feature_names, colnames(Xmat))])
if ((nrow(Xmat_ord) != nrow(Xmat)) & (ncol(Xmat_ord) == nrow(Xmat))) {
Xmat_ord <- t(Xmat_ord)
}
stopifnot(nrow(Xmat_ord) == nrow(Xmat))
# convert to xgb.DMatrix
xgb_data <- try(xgboost::xgb.DMatrix(Xmat_ord), silent = TRUE)

# incorporate offset, if it wasspecified in training
if (self$fit_object$training_offset) {
offset <- task$offset_transformed(
self$fit_object$link_fun,
for_prediction = TRUE
)
try(xgboost::setinfo(xgb_data, "base_margin", offset), silent = TRUE)
}

# incorporate ntreelimit, if training model was not a gblinear-based fit
ntreelimit <- 0
if (!is.null(fit_object[["best_ntreelimit"]]) &
!("gblinear" %in% fit_object[["params"]][["booster"]])) {
ntreelimit <- fit_object[["best_ntreelimit"]]
booster <- fit_object$booster
meta <- fit_object$meta

# raw covariates; relevel to training levels
Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
Comment on lines +182 to +183
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment says "relevel to training levels" but no actual releveling is performed. If factor levels in the prediction data differ from training data, this could cause issues with xgboost's categorical feature handling. Consider adding logic to ensure factor levels match those used during training, or update the comment to reflect what the code actually does.

Copilot uses AI. Check for mistakes.

xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE)
if (!inherits(xgb_data, "xgb.DMatrix")) stop("Failed to build DMatrix for prediction.")
Comment on lines +185 to +186
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The prediction DMatrix is constructed without feature_names or feature_types, unlike the training DMatrix. This inconsistency could lead to issues if xgboost expects the same metadata during prediction. Consider adding feature_names and feature_types to ensure consistency with training.

Copilot uses AI. Check for mistakes.

# base_margin if used in training
if (isTRUE(meta$training_offset)) {
offset <- task$offset_transformed(meta$link_fun, for_prediction = TRUE)
xgboost::setinfo(xgb_data, "base_margin", offset)
}

predictions <- rep.int(list(numeric()), 1)
if (nrow(Xmat) > 0) {
# will generally return vector, needs to be put into data.table column
predictions <- stats::predict(
fit_object,
newdata = xgb_data, ntreelimit = ntreelimit, reshape = TRUE
)

if (private$.training_outcome_type$type == "categorical") {
# pack predictions in a single column
predictions <- pack_predictions(predictions)
}

predictions <- stats::predict(booster, newdata = xgb_data, strict_shape=TRUE)
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space after the comma in the parameter definition. Should be "strict_shape = TRUE" instead of "strict_shape=TRUE" to follow R coding conventions.

Suggested change
predictions <- stats::predict(booster, newdata = xgb_data, strict_shape=TRUE)
predictions <- stats::predict(booster, newdata = xgb_data, strict_shape = TRUE)

Copilot uses AI. Check for mistakes.

if (private$.training_outcome_type$type == "categorical") {
# pack predictions in a single column
predictions <- pack_predictions(predictions)
}

return(predictions)
},
.required_packages = c("xgboost")
Expand Down
19 changes: 17 additions & 2 deletions R/sl3_Task.R
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,23 @@ sl3_Task <- R6Class(
} else {
# match interaction terms to X
Xmatch <- lapply(int, function(i) {
grep(i, colnames(self$X), value = TRUE)
})
cols <- colnames(self$X)

# detect if 'i' is represented by factor dummies in the design
has_factor_dummies <- any(startsWith(cols, paste0(i, ".")))

if (has_factor_dummies) {
# prefix match for factor dummy columns, anchored
grep(paste0("^", i, "\\."), colnames(self$X), value = TRUE)
Comment on lines +211 to +215
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The startsWith function uses paste0(i, ".") for detection but then uses a different pattern paste0("^", i, "\.") for matching with grep. For consistency and to avoid edge cases, both should use the same escaped pattern. The startsWith function should be: startsWith(cols, paste0(i, ".")) is correct, but it would be clearer if the pattern was defined once and reused.

Suggested change
has_factor_dummies <- any(startsWith(cols, paste0(i, ".")))
if (has_factor_dummies) {
# prefix match for factor dummy columns, anchored
grep(paste0("^", i, "\\."), colnames(self$X), value = TRUE)
pattern <- paste0("^", i, "\\.")
has_factor_dummies <- any(grepl(pattern, cols))
if (has_factor_dummies) {
grep(pattern, colnames(self$X), value = TRUE)

Copilot uses AI. Check for mistakes.
} else if (i %in% cols) {
# exact match for a single numeric (or already-numeric) column
i
} else {
# nothing found; better to fail loud than silently drop an interaction
warning("No matching columns in design matrix for interaction term '", i, "'.")
character(0)
}
})
Xint <- as.list(data.table::as.data.table(t(expand.grid(Xmatch))))

d_Xint <- lapply(Xint, function(Xint) {
Expand Down
Loading