tidymodels
diff --git a/‎NEWS.md‎
Lines changed: 2 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎R/linear_reg-fit.R‎
Lines changed: 0 additions & 7 deletions b/‎R/linear_reg-fit.R‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎R/mlp-fit.R‎
Lines changed: 40 additions & 10 deletions b/‎R/mlp-fit.R‎
Lines changed: 40 additions & 10 deletions
diff --git a/‎R/schedulers.R‎
Lines changed: 3 additions & 3 deletions b/‎R/schedulers.R‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎README.Rmd‎
Lines changed: 2 additions & 2 deletions b/‎README.Rmd‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 12 additions & 12 deletions b/‎README.md‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎man/brulee-package.Rd‎
Lines changed: 2 additions & 0 deletions b/‎man/brulee-package.Rd‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎man/brulee_linear_reg.Rd‎
Lines changed: 4 additions & 4 deletions b/‎man/brulee_linear_reg.Rd‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎man/brulee_logistic_reg.Rd‎
Lines changed: 1 addition & 1 deletion b/‎man/brulee_logistic_reg.Rd‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎man/brulee_mlp.Rd‎
Lines changed: 9 additions & 2 deletions b/‎man/brulee_mlp.Rd‎
Lines changed: 9 additions & 2 deletions
@@ -2,6 +2,8 @@
 
 * Several learning rate schedulers were added to the modeling functions (#12).
 
+* An `optimizer` argument was added to [brulee_mlp()]; the new default is LBFGS instead of stochastic gradient descent.
+
 # brulee 0.1.0
 
 * Modeling functions gained a `mixture` argument for the proportion of L1 penalty that is used. (#50)
 
@@ -24,13 +24,6 @@
 #'
 #' @inheritParams brulee_mlp
 #'
-#' @param optimizer The method used in the optimization procedure. Possible choices
-#'   are 'LBFGS' and 'SGD'. Default is 'LBFGS'.
-#' @param learn_rate A positive number that controls the rapidity that the model
-#' moves along the descent path. Values less that 0.1 are typical.
-#' (`optimizer = "SGD"` only)
-#' @param momentum A positive number usually on `[0.50, 0.99]` for the momentum
-#' parameter in gradient descent.  (`optimizer = "SGD"` only)
 #' @details
 #'
 #' This function fits a linear combination of coefficients and predictors to
 
@@ -37,6 +37,8 @@
 #'  "relu", "elu", "tanh", and "linear". If `hidden_units` is a vector, `activation`
 #'  can be a character vector with length equal to `length(hidden_units)` specifying
 #'  the activation for each hidden layer.
+#' @param optimizer The method used in the optimization procedure. Possible choices
+#'   are 'LBFGS' and 'SGD'. Default is 'LBFGS'.
 #' @param learn_rate A positive number that controls the initial rapidity that
 #' the model moves along the descent path. Values around 0.1 or less are
 #' typical.
@@ -45,7 +47,7 @@
 #' `"none"` (the default), `"decay_time"`, `"decay_expo"`, `"cyclic"` and
 #' `"step"`. See [schedule_decay_time()] for more details.
 #' @param momentum A positive number usually on `[0.50, 0.99]` for the momentum
-#' parameter in gradient descent.
+#' parameter in gradient descent.  (`optimizer = "SGD"` only)
 #' @param dropout The proportion of parameters set to zero.
 #' @param class_weights Numeric class weights (classification only). The value
 #' can be:
@@ -59,7 +61,7 @@
 #' @param validation The proportion of the data randomly assigned to a
 #'  validation set.
 #' @param batch_size An integer for the number of training set points in each
-#'  batch.
+#'  batch. (`optimizer = "SGD"` only)
 #' @param stop_iter A non-negative integer for how many iterations with no
 #' improvement before stopping.
 #' @param verbose A logical that prints out the iteration history.
@@ -239,6 +241,7 @@ brulee_mlp.data.frame <-
            mixture = 0,
            dropout = 0,
            validation = 0.1,
+           optimizer = "LBFGS",
            learn_rate = 0.01,
            rate_schedule = "none",
            momentum = 0.0,
@@ -260,6 +263,7 @@ brulee_mlp.data.frame <-
       mixture = mixture,
       dropout = dropout,
       validation = validation,
+      optimizer = optimizer,
       momentum = momentum,
       batch_size = batch_size,
       class_weights = class_weights,
@@ -282,6 +286,7 @@ brulee_mlp.matrix <- function(x,
                               mixture = 0,
                               dropout = 0,
                               validation = 0.1,
+                              optimizer = "LBFGS",
                               learn_rate = 0.01,
                               rate_schedule = "none",
                               momentum = 0.0,
@@ -304,6 +309,7 @@ brulee_mlp.matrix <- function(x,
     mixture = mixture,
     dropout = dropout,
     validation = validation,
+    optimizer = optimizer,
     batch_size = batch_size,
     class_weights = class_weights,
     stop_iter = stop_iter,
@@ -326,6 +332,7 @@ brulee_mlp.formula <-
            mixture = 0,
            dropout = 0,
            validation = 0.1,
+           optimizer = "LBFGS",
            learn_rate = 0.01,
            rate_schedule = "none",
            momentum = 0.0,
@@ -348,6 +355,7 @@ brulee_mlp.formula <-
       mixture = mixture,
       dropout = dropout,
       validation = validation,
+      optimizer = optimizer,
       batch_size = batch_size,
       class_weights = class_weights,
       stop_iter = stop_iter,
@@ -370,6 +378,7 @@ brulee_mlp.recipe <-
            mixture = 0,
            dropout = 0,
            validation = 0.1,
+           optimizer = "LBFGS",
            learn_rate = 0.01,
            rate_schedule = "none",
            momentum = 0.0,
@@ -392,6 +401,7 @@ brulee_mlp.recipe <-
       mixture = mixture,
       dropout = dropout,
       validation = validation,
+      optimizer = optimizer,
       batch_size = batch_size,
       class_weights = class_weights,
       stop_iter = stop_iter,
@@ -405,7 +415,7 @@ brulee_mlp.recipe <-
 
 brulee_mlp_bridge <- function(processed, epochs, hidden_units, activation,
                               learn_rate, rate_schedule, momentum, penalty,
-                              mixture, dropout, class_weights, validation,
+                              mixture, dropout, class_weights, validation, optimizer,
                               batch_size, stop_iter, verbose, ...) {
   if(!torch::torch_is_installed()) {
     rlang::abort("The torch backend has not been installed; use `torch::install_torch()`.")
@@ -426,6 +436,10 @@ brulee_mlp_bridge <- function(processed, epochs, hidden_units, activation,
     rlang::abort("'activation' must be a single value or a vector with the same length as 'hidden_units'")
   }
 
+  if (optimizer == "LBFGS" & !is.null(batch_size)) {
+   rlang::warn("'batch_size' is only use for the SGD optimizer.")
+  }
+
   check_integer(epochs, single = TRUE, 1, fn = f_nm)
   if (!is.null(batch_size)) {
     if (is.numeric(batch_size) & !is.integer(batch_size)) {
@@ -487,6 +501,7 @@ brulee_mlp_bridge <- function(processed, epochs, hidden_units, activation,
       mixture = mixture,
       dropout = dropout,
       validation = validation,
+      optimizer = optimizer,
       batch_size = batch_size,
       class_weights = class_weights,
       stop_iter = stop_iter,
@@ -555,6 +570,7 @@ mlp_fit_imp <-
            mixture = 0,
            dropout = 0,
            validation = 0.1,
+           optimizer = "LBFGS",
            learn_rate = 0.01,
            rate_schedule = "none",
            momentum = 0.0,
@@ -640,6 +656,17 @@ mlp_fit_imp <-
     model <- mlp_module(ncol(x), hidden_units, activation, dropout, y_dim)
     loss_fn <- make_penalized_loss(loss_fn, model, penalty, mixture)
 
+    # Set the optimizer
+    if (optimizer == "LBFGS") {
+     optimizer <- torch::optim_lbfgs(model$parameters, lr = learn_rate,
+                                     history_size = 5)
+    } else if (optimizer == "SGD") {
+     optimizer <-
+      torch::optim_sgd(model$parameters, lr = learn_rate, momentum = momentum)
+    } else {
+     rlang::abort(paste0("Unknown optimizer '", optimizer, "'"))
+    }
+
     ## ---------------------------------------------------------------------------
 
     loss_prev <- 10^38
@@ -671,14 +698,16 @@ mlp_fit_imp <-
 
       # training loop
       coro::loop(
-        for (batch in dl) {
-          pred <- model(batch$x)
-          loss <- loss_fn(pred, batch$y, class_weights)
-
-          optimizer$zero_grad()
-          loss$backward()
-          optimizer$step()
+       for (batch in dl) {
+        cl <- function() {
+         optimizer$zero_grad()
+         pred <- model(batch$x)
+         loss <- loss_fn(pred, batch$y, class_weights)
+         loss$backward()
+         loss
         }
+        optimizer$step(cl)
+       }
       )
 
       # calculate loss on the full datasets
@@ -750,6 +779,7 @@ mlp_fit_imp <-
        mixture = mixture,
        dropout = dropout,
        validation = validation,
+       optimizer = optimizer,
        batch_size = batch_size,
        momentum = momentum,
        sched = rate_schedule,
 
@@ -24,11 +24,11 @@
 #' The details for how the schedulers change the rates:
 #'
 #' * `schedule_decay_time()`: \eqn{rate(epoch) = initial/(1 + decay \times epoch)}
-#' * `schedule_decay_expo()`: \eqn{initial\exp(-decay \times epoch)}
-#' * `schedule_step()`: \eqn{initial \times reduction^{floor(epoch / steps)}}
+#' * `schedule_decay_expo()`: \eqn{rate(epoch) = initial\exp(-decay \times epoch)}
+#' * `schedule_step()`: \eqn{rate(epoch) = initial \times reduction^{floor(epoch / steps)}}
 #' * `schedule_cyclic()`: \eqn{cycle = floor( 1 + (epoch / 2 / step size) )},
 #'  \eqn{x = abs( ( epoch / step size ) - ( 2 * cycle) + 1 )}, and
-#'  \eqn{rate = initial + ( largest - initial ) * \max( 0, 1 - x)}
+#'  \eqn{rate(epoch) = initial + ( largest - initial ) * \max( 0, 1 - x)}
 #'
 #'
 #' @seealso [brulee_mlp()]
 
@@ -60,7 +60,7 @@ library(yardstick)
 data(bivariate, package = "modeldata")
 set.seed(20)
 nn_log_biv <- brulee_mlp(Class ~ log(A) + log(B), data = bivariate_train, 
-                         epochs = 150, hidden_units = 3, batch_size = 64)
+                         epochs = 150, hidden_units = 3)
 
 # We use the tidymodels semantics to always return a tibble when predicting
 predict(nn_log_biv, bivariate_test, type = "prob") %>% 
@@ -80,7 +80,7 @@ rec <-
 
 set.seed(20)
 nn_rec_biv <- brulee_mlp(rec, data = bivariate_train, 
-                         epochs = 150, hidden_units = 3, batch_size = 64)
+                         epochs = 150, hidden_units = 3)
 
 # A little better
 predict(nn_rec_biv, bivariate_test, type = "prob") %>% 
 
@@ -15,14 +15,14 @@ experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](h
 The R `brulee` package contains several basic modeling functions that
 use the `torch` package infrastructure, such as:
 
--   [neural
-    networks](https://tidymodels.github.io/brulee/reference/brulee_mlp.html)
--   [linear
-    regression](https://tidymodels.github.io/brulee/reference/brulee_linear_reg.html)
--   [logistic
-    regression](https://tidymodels.github.io/brulee/reference/brulee_logistic_reg.html)
--   [multinomial
-    regression](https://tidymodels.github.io/brulee/reference/brulee_multinomial_reg.html)
+- [neural
+  networks](https://tidymodels.github.io/brulee/reference/brulee_mlp.html)
+- [linear
+  regression](https://tidymodels.github.io/brulee/reference/brulee_linear_reg.html)
+- [logistic
+  regression](https://tidymodels.github.io/brulee/reference/brulee_logistic_reg.html)
+- [multinomial
+  regression](https://tidymodels.github.io/brulee/reference/brulee_multinomial_reg.html)
 
 ## Installation
 
@@ -54,7 +54,7 @@ library(yardstick)
 data(bivariate, package = "modeldata")
 set.seed(20)
 nn_log_biv <- brulee_mlp(Class ~ log(A) + log(B), data = bivariate_train, 
-                         epochs = 150, hidden_units = 3, batch_size = 64)
+                         epochs = 150, hidden_units = 3)
 
 # We use the tidymodels semantics to always return a tibble when predicting
 predict(nn_log_biv, bivariate_test, type = "prob") %>% 
@@ -63,7 +63,7 @@ predict(nn_log_biv, bivariate_test, type = "prob") %>%
 #> # A tibble: 1 × 3
 #>   .metric .estimator .estimate
 #>   <chr>   <chr>          <dbl>
-#> 1 roc_auc binary         0.608
+#> 1 roc_auc binary         0.839
 ```
 
 A recipe can also be used if the data require some sort of preprocessing
@@ -79,7 +79,7 @@ rec <-
 
 set.seed(20)
 nn_rec_biv <- brulee_mlp(rec, data = bivariate_train, 
-                         epochs = 150, hidden_units = 3, batch_size = 64)
+                         epochs = 150, hidden_units = 3)
 
 # A little better
 predict(nn_rec_biv, bivariate_test, type = "prob") %>% 
@@ -88,7 +88,7 @@ predict(nn_rec_biv, bivariate_test, type = "prob") %>%
 #> # A tibble: 1 × 3
 #>   .metric .estimator .estimate
 #>   <chr>   <chr>          <dbl>
-#> 1 roc_auc binary         0.865
+#> 1 roc_auc binary         0.866
 ```
 
 ## Code of Conduct