tidymodels
diff --git a/‎.github/workflows/R-CMD-check.yaml‎
Lines changed: 20 additions & 0 deletions b/‎.github/workflows/R-CMD-check.yaml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 5 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎NEWS.md‎
Lines changed: 2 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎R/catboost.R‎
Lines changed: 222 additions & 0 deletions b/‎R/catboost.R‎
Lines changed: 222 additions & 0 deletions
@@ -39,6 +39,7 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       R_KEEP_PKG_SOURCE: yes
+      GHA_OS: ${{ matrix.config.os }}
 
     steps:
       - uses: actions/checkout@v4
@@ -56,6 +57,25 @@ jobs:
           extra-packages: any::rcmdcheck
           needs: check
 
+      - name: Install catboost
+        # Installs the prebuilt catboost 1.2.8 R package from the GitHub
+        # release archive whose name matches the runner OS in GHA_OS.
+        run: |
+          install.packages("remotes")
+          # Any OS other than the macOS/Windows runners uses the Linux build.
+          platform <- switch(
+            Sys.getenv("GHA_OS"),
+            "macos-latest" = "Darwin",
+            "windows-latest" = "Windows",
+            "Linux"
+          )
+          url <- paste0(
+            "https://github.com/catboost/catboost/releases/download/v1.2.8/catboost-R-",
+            platform,
+            "-1.2.8.tgz"
+          )
+          install_opts <- c("--no-multiarch", "--no-test-load", "--no-staged-install")
+          remotes::install_url(url, INSTALL_opts = install_opts)
+        shell: Rscript {0}
+
       - uses: r-lib/actions/check-r-package@v2
         with:
           upload-snapshots: true
 
@@ -2,10 +2,15 @@
 
 S3method(multi_predict,"_lgb.Booster")
 export("%>%")
+export(predict_catboost_classification_class)
+export(predict_catboost_classification_prob)
+export(predict_catboost_classification_raw)
+export(predict_catboost_regression_numeric)
 export(predict_lightgbm_classification_class)
 export(predict_lightgbm_classification_prob)
 export(predict_lightgbm_classification_raw)
 export(predict_lightgbm_regression_numeric)
+export(train_catboost)
 export(train_lightgbm)
 import(rlang)
 importFrom(dials,min_n)
 
@@ -8,6 +8,8 @@
 
 * Fixed bug where the `num_threads` argument was ignored for lightgbm engine (#105).
 
+* Added catboost engine to `boost_tree()` (#70).
+
 # bonsai 0.3.2
 
 * Resolves a test failure ahead of an upcoming parsnip release (#95).
 
@@ -0,0 +1,222 @@
+#' Boosted trees with catboost
+#'
+#' `train_catboost` is a wrapper for `catboost` tree-based models
+#' where all of the model arguments are in the main function.
+#'
+#' This is an internal function, not meant to be directly called by the user.
+#'
+#' @param x A data frame of predictors.
+#' @param y A vector (factor or numeric) or matrix (numeric) of outcome data.
+#' A factor is recoded to zero-based integer codes (in level order) before it
+#' is used as the catboost label.
+#' @param weights A numeric vector of sample weights, defaults to `NULL`.
+#' @param iterations The maximum number of trees that can be built when solving
+#' machine learning problems. Defaults to 1000.
+#' @param learning_rate A positive numeric value for the learning rate. Defaults
+#' to 0.03.
+#' @param depth An integer for the depth of the trees. Defaults to 6.
+#' @param l2_leaf_reg A numeric value for the L2 regularization coefficient.
+#' Used for leaf value calculation. Defaults to 3.
+#' @param random_strength The amount of randomness to use for scoring splits
+#' when the tree structure is selected. Use this parameter to avoid overfitting
+#' the model. Defaults to 1.
+#' @param bagging_temperature A numeric value, controls intensity of Bayesian
+#' bagging. The higher the temperature the more aggressive bagging is. Defaults
+#' to 1.
+#' @param rsm A numeric value between 0 and 1, random subspace method. The
+#' percentage of features to use at each iteration of building trees. At each
+#' iteration, features are selected over again at random. Defaults to 1.
+#' @param quiet A logical; should logging by [catboost::catboost.train()] be
+#' muted?
+#' @param ... Other options to pass to [catboost::catboost.train()]. They are
+#' added to its `params` list together with the main arguments above. A list
+#' supplied as `params` is dropped with a warning. When no `loss_function` is
+#' given, one is chosen from `y`: `"RMSE"` for numeric outcomes, `"Logloss"`
+#' for two-level factors, and `"MultiClass"` otherwise.
+#'
+#' @source \url{https://catboost.ai/docs/en/references/training-parameters/}.
+#'
+#' @return A fitted `catboost.Model` object.
+#' @keywords internal
+#' @export
+train_catboost <- function(
+  x,
+  y,
+  weights = NULL,
+  iterations = 1000,
+  learning_rate = 0.03,
+  depth = 6,
+  l2_leaf_reg = 3,
+  random_strength = 1,
+  bagging_temperature = 1,
+  rsm = 1,
+  quiet = TRUE,
+  ...
+) {
+  force(x)
+  force(y)
+
+  # Validation errors are reported against a `fit()` call, not this helper.
+  error_call <- call2("fit")
+
+  check_number_whole(iterations, call = error_call)
+  check_number_decimal(learning_rate, call = error_call)
+  check_number_whole(depth, call = error_call)
+  check_number_decimal(l2_leaf_reg, call = error_call)
+  check_number_decimal(random_strength, call = error_call)
+  check_number_decimal(bagging_temperature, call = error_call)
+  check_number_decimal(rsm, call = error_call)
+  check_bool(quiet, call = error_call)
+
+  # Everything, including `...`, ends up in catboost's `params` list.
+  train_params <- list(
+    iterations = iterations,
+    learning_rate = learning_rate,
+    depth = depth,
+    l2_leaf_reg = l2_leaf_reg,
+    random_strength = random_strength,
+    bagging_temperature = bagging_temperature,
+    rsm = rsm,
+    ...
+  )
+
+  # Must run before `y` is recoded: the default loss depends on `levels(y)`.
+  train_params <- process_loss_function(train_params, y)
+
+  # `[[` is used because `$` partially matches names on lists.
+  if (is.list(train_params[["params"]])) {
+    cli::cli_warn(c(
+      "Arguments passed in through {.arg params} as a list will be ignored.",
+      "Instead pass the arguments directly to the {.code ...}."
+    ))
+    train_params[["params"]] <- NULL
+  }
+
+  # `catboost.load_pool()` expects a numeric label, so a factor outcome is
+  # recoded to 0-based integer codes. This is the encoding that
+  # `predict_catboost_classification_class()` undoes with `object$lvl[p + 1]`.
+  if (is.factor(y)) {
+    y <- as.integer(y) - 1L
+  }
+
+  pool_call <- rlang::call2(
+    "catboost.load_pool",
+    data = x,
+    label = y,
+    weight = weights,
+    .ns = "catboost"
+  )
+  learn_pool <- rlang::eval_tidy(pool_call, env = rlang::current_env())
+
+  train_args <- list(
+    learn_pool = learn_pool,
+    params = train_params
+  )
+
+  train_call <- rlang::call2("catboost.train", !!!train_args, .ns = "catboost")
+
+  if (quiet) {
+    # Swallow whatever catboost prints to stdout while training.
+    junk <- utils::capture.output(
+      res <- rlang::eval_tidy(train_call, env = rlang::current_env())
+    )
+  } else {
+    res <- rlang::eval_tidy(train_call, env = rlang::current_env())
+  }
+
+  res
+}
+
+#' Internal functions
+#'
+#' Not intended for direct use.
+#'
+#' @keywords internal
+#' @export
+#' @rdname catboost_helpers
+predict_catboost_regression_numeric <- function(object, new_data, ...) {
+  # Wrap `new_data` in a catboost pool; the call is built with `call2()` and
+  # resolved in the catboost namespace.
+  load_pool_call <- rlang::call2(
+    "catboost.load_pool",
+    data = new_data,
+    .ns = "catboost"
+  )
+
+  # Predict from the fitted model stored in `object$fit`, leaving
+  # `prediction_type` at catboost's default.
+  predict_call <- rlang::call2(
+    "catboost.predict",
+    model = object$fit,
+    pool = rlang::eval_tidy(load_pool_call),
+    .ns = "catboost"
+  )
+
+  rlang::eval_tidy(predict_call)
+}
+
+#' @keywords internal
+#' @export
+#' @rdname catboost_helpers
+predict_catboost_classification_class <- function(object, new_data, ...) {
+  # Wrap `new_data` in a catboost pool.
+  load_pool_call <- rlang::call2(
+    "catboost.load_pool",
+    data = new_data,
+    .ns = "catboost"
+  )
+
+  # Ask catboost for hard class predictions from the model in `object$fit`.
+  predict_call <- rlang::call2(
+    "catboost.predict",
+    model = object$fit,
+    pool = rlang::eval_tidy(load_pool_call),
+    prediction_type = "Class",
+    .ns = "catboost"
+  )
+  class_code <- rlang::eval_tidy(predict_call)
+
+  # Shift by one to index into `object$lvl`; presumably the returned codes
+  # are zero-based -- confirm against how labels were encoded at fit time.
+  object$lvl[class_code + 1]
+}
+
+#' @keywords internal
+#' @export
+#' @rdname catboost_helpers
+predict_catboost_classification_prob <- function(object, new_data, ...) {
+  # Wrap `new_data` in a catboost pool.
+  pool <- rlang::eval_tidy(rlang::call2(
+    "catboost.load_pool",
+    data = new_data,
+    .ns = "catboost"
+  ))
+
+  # Ask catboost for class probabilities from the model in `object$fit`.
+  p <- rlang::eval_tidy(rlang::call2(
+    "catboost.predict",
+    model = object$fit,
+    pool = pool,
+    prediction_type = "Probability",
+    .ns = "catboost"
+  ))
+
+  # A dimensionless result is treated as the probability of the second level;
+  # the first level's probability is its complement. `is.null(dim(p))` is used
+  # instead of `is.vector(p)` because the latter is FALSE for a plain vector
+  # that carries any attribute other than names, which would skip this branch
+  # and make the `colnames<-` call below fail.
+  if (is.null(dim(p))) {
+    p <- tibble::tibble(p1 = 1 - p, p2 = p)
+  }
+
+  # One column per outcome level, named in the order stored in `object$lvl`.
+  colnames(p) <- object$lvl
+
+  tibble::as_tibble(p)
+}
+
+#' @keywords internal
+#' @export
+#' @rdname catboost_helpers
+predict_catboost_classification_raw <- function(object, new_data, ...) {
+  # Wrap `new_data` in a catboost pool.
+  load_pool_call <- rlang::call2(
+    "catboost.load_pool",
+    data = new_data,
+    .ns = "catboost"
+  )
+
+  # No `prediction_type` is supplied, so catboost's default output is
+  # returned unchanged.
+  predict_call <- rlang::call2(
+    "catboost.predict",
+    model = object$fit,
+    pool = rlang::eval_tidy(load_pool_call),
+    .ns = "catboost"
+  )
+
+  rlang::eval_tidy(predict_call)
+}
+
+# Fill in a default catboost `loss_function` based on the outcome type.
+# See https://catboost.ai/docs/en/concepts/loss-functions
+#
+# `args` is the list of training parameters and `y` the outcome. The list is
+# returned with `loss_function` added when the caller did not supply one:
+# "RMSE" for a numeric `y`, "Logloss" when `y` has exactly two levels, and
+# "MultiClass" for everything else.
+process_loss_function <- function(args, y) {
+  # A user-supplied loss function always wins; hand the list back untouched.
+  if ("loss_function" %in% names(args)) {
+    return(args)
+  }
+
+  n_levels <- length(levels(y))
+
+  if (is.numeric(y)) {
+    default_loss <- "RMSE"
+  } else if (n_levels == 2) {
+    default_loss <- "Logloss"
+  } else {
+    default_loss <- "MultiClass"
+  }
+  args$loss_function <- default_loss
+
+  args
+}