@@ -119,6 +119,63 @@ callback_model_checkpoint <- function(filepath, monitor = "val_loss", verbose =
 }
 
 
+#' Callback to back up and restore the training state
+#'
+#' @details
+#' `BackupAndRestore` callback is intended to recover training from an
+#' interruption that has happened in the middle of a `fit(model)` execution, by
+#' backing up the training states in a temporary checkpoint file (with the help
+#' of a `tf.train.CheckpointManager`), at the end of each epoch. Each backup
+#' overwrites the previously written checkpoint file, so at any given time there
+#' is at most one such checkpoint file for backup/restoring purposes.
+#'
+#' If training restarts before completion, the training state (which includes the
+#' `Model` weights and epoch number) is restored to the most recently saved state
+#' at the beginning of a new `fit()` run. At the completion of a `fit()`
+#' run, the temporary checkpoint file is deleted.
+#'
+#' Note that the user is responsible for bringing jobs back after the
+#' interruption. This callback is important for the backup and restore mechanism
+#' for fault tolerance purposes, and the model restored from a previous
+#' checkpoint is expected to be the same as the one used to back it up. If the
+#' user changes arguments passed to `compile()` or `fit()`, the checkpoint saved
+#' for fault tolerance can become invalid.
+#'
+#' Note:
+#'
+#' 1. This callback is not compatible with disabling eager execution.
+#'
+#' 2. A checkpoint is saved at the end of each epoch. After restoring,
+#' `fit()` redoes any partial work during the unfinished epoch in which the
+#' training got restarted (so the work done before the interruption doesn't
+#' affect the final model state).
+#'
+#' 3. This works for both single-worker and multi-worker modes. When `fit()`
+#' is used with `tf.distribute`, it supports `tf.distribute.MirroredStrategy`,
+#' `tf.distribute.MultiWorkerMirroredStrategy`, `tf.distribute.TPUStrategy`, and
+#' `tf.distribute.experimental.ParameterServerStrategy`.
+#'
+#' @param backup_dir String, path to store the checkpoint,
+#' e.g. `backup_dir = normalizePath('./backup')`.
+#' This is the directory in which the system stores temporary files to
+#' recover the model from jobs terminated unexpectedly. The directory
+#' cannot be reused elsewhere to store other files, e.g. by the
+#' `BackupAndRestore` callback of another training run, or by another callback
+#' (such as `ModelCheckpoint`) of the same training run.
+#' @param ... For backwards and forwards compatibility.
+#'
+#' @seealso
+#'   + <https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/BackupAndRestore>
+#'
+#' @export
+callback_backup_and_restore <-
+function(backup_dir, ...) {
+  args <- capture_args(match.call(), NULL)
+  require_tf_version("2.8", "callback_backup_and_restore")
+  do.call(keras$callbacks$BackupAndRestore, args)
+}
+
+
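As a usage illustration (not part of this commit), the sketch below assumes a compiled Keras model `model` and training data `x_train`/`y_train` already exist. It shows the intended workflow: the callback writes its temporary checkpoint into `backup_dir` at the end of every epoch, and re-running the same `fit()` call after an interruption resumes from the last completed epoch.

```r
library(keras)

# Hypothetical objects: `model` is assumed to be a compiled Keras model,
# `x_train` / `y_train` an existing training set.
backup_cb <- callback_backup_and_restore(
  backup_dir = normalizePath("./backup", mustWork = FALSE)
)

# If this run is interrupted (e.g. the job is preempted) and the same script
# is launched again with the same backup_dir, fit() restores the model weights
# and epoch counter from the temporary checkpoint and continues training
# instead of starting over. The checkpoint is deleted once fit() completes.
model %>% fit(
  x_train, y_train,
  epochs = 10,
  callbacks = list(backup_cb)
)
```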
 #' Stop training when a monitored quantity has stopped improving.
 #'
 #' @inheritParams callback_model_checkpoint
@@ -750,3 +807,5 @@ normalize_callbacks <- function(callbacks) {
 }
 
 empty_fun <- function(batch, logs = NULL) {}
+
+