 #' @param gamma_regularizer Optional regularizer for the gamma weight.
 #' @param beta_constraint Optional constraint for the beta weight.
 #' @param gamma_constraint Optional constraint for the gamma weight.
-#'
+#' @param renorm Whether to use Batch Renormalization
+#'   (https://arxiv.org/abs/1702.03275). This adds extra variables during
+#'   training. The inference is the same for either value of this parameter.
+#' @param renorm_clipping A named list or dictionary that may map the keys `rmax`,
+#'   `rmin`, and `dmax` to scalar Tensors used to clip the renorm correction. The
+#'   correction `(r, d)` is used as `corrected_value = normalized_value * r + d`,
+#'   with `r` clipped to `[rmin, rmax]` and `d` to `[-dmax, dmax]`. Missing `rmax`,
+#'   `rmin`, and `dmax` are set to `Inf`, `0`, and `Inf`, respectively.
+#' @param renorm_momentum Momentum used to update the moving means and standard
+#'   deviations with renorm. Unlike `momentum`, this affects training and should
+#'   be neither too small (which would add noise) nor too large (which would
+#'   give stale estimates). Note that `momentum` is still applied to get the means
+#'   and variances for inference.
+#' @param fused If `TRUE`, use a faster, fused implementation, or raise a ValueError
+#'   if the fused implementation cannot be used. If `NULL`, use the faster
+#'   implementation if possible. If `FALSE`, do not use the fused implementation.
+#' @param virtual_batch_size An integer. By default, `virtual_batch_size` is `NULL`,
+#'   which means batch normalization is performed across the whole batch.
+#'   When `virtual_batch_size` is not `NULL`, instead perform "Ghost Batch
+#'   Normalization", which creates virtual sub-batches that are each normalized
+#'   separately (with shared gamma, beta, and moving statistics). Must divide
+#'   the actual batch size during execution.
+#' @param adjustment A function taking the Tensor containing the (dynamic) shape
+#'   of the input tensor and returning a pair `(scale, bias)` to apply to the
+#'   normalized values (before gamma and beta), only during training.
+#'   For example, if `axis == -1`,
+#'   \code{adjustment <- function(shape) {
+#'     tuple(tf$random$uniform(shape[-1:NULL, style = "python"], 0.93, 1.07),
+#'           tf$random$uniform(shape[-1:NULL, style = "python"], -0.1, 0.1))
+#'   }}
+#'   will scale the normalized value by up to 7% up or down, then shift the
+#'   result by up to 0.1 (with independent scaling and bias for each feature
+#'   but shared across all examples), and finally apply gamma and/or beta. If
+#'   `NULL`, no adjustment is applied. Cannot be specified if
+#'   `virtual_batch_size` is specified.
 #' @section Input shape: Arbitrary. Use the keyword argument `input_shape` (list
 #'   of integers, does not include the samples axis) when using this layer as
 #'   the first layer in a model.
@@ -36,13 +70,17 @@
 #' - [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167)
 #'
 #' @export
-layer_batch_normalization <- function(object, axis = -1L, momentum = 0.99, epsilon = 0.001, center = TRUE, scale = TRUE,
-                                      beta_initializer = "zeros", gamma_initializer = "ones",
-                                      moving_mean_initializer = "zeros", moving_variance_initializer = "ones",
-                                      beta_regularizer = NULL, gamma_regularizer = NULL,
-                                      beta_constraint = NULL, gamma_constraint = NULL,
-                                      input_shape = NULL, batch_input_shape = NULL, batch_size = NULL,
-                                      dtype = NULL, name = NULL, trainable = NULL, weights = NULL) {
+layer_batch_normalization <- function(object, axis = -1L, momentum = 0.99, epsilon = 0.001, center = TRUE, scale = TRUE,
+                                      beta_initializer = "zeros", gamma_initializer = "ones",
+                                      moving_mean_initializer = "zeros", moving_variance_initializer = "ones",
+                                      beta_regularizer = NULL, gamma_regularizer = NULL, beta_constraint = NULL,
+                                      gamma_constraint = NULL, renorm = FALSE, renorm_clipping = NULL,
+                                      renorm_momentum = 0.99, fused = NULL, virtual_batch_size = NULL,
+                                      adjustment = NULL, input_shape = NULL, batch_input_shape = NULL,
+                                      batch_size = NULL, dtype = NULL, name = NULL, trainable = NULL, weights = NULL) {
+
+  stopifnot(is.null(adjustment) || is.function(adjustment))
+
   create_layer(keras$layers$BatchNormalization, object, list(
     axis = as.integer(axis),
     momentum = momentum,
@@ -57,12 +95,18 @@ layer_batch_normalization <- function(object, axis = -1L, momentum = 0.99, epsil
     gamma_regularizer = gamma_regularizer,
     beta_constraint = beta_constraint,
     gamma_constraint = gamma_constraint,
+    renorm = renorm,
+    renorm_clipping = renorm_clipping,
+    renorm_momentum = renorm_momentum,
+    fused = fused,
     input_shape = normalize_shape(input_shape),
     batch_input_shape = normalize_shape(batch_input_shape),
     batch_size = as_nullable_integer(batch_size),
     dtype = dtype,
     name = name,
     trainable = trainable,
+    virtual_batch_size = as_nullable_integer(virtual_batch_size),
+    adjustment = adjustment,
     weights = weights
   ))
-}
+}
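
Below is a minimal usage sketch, not part of this commit, showing how the newly documented arguments could be passed once this change is in place. It assumes the keras R package with a working TensorFlow backend; the layer sizes, clipping values, and `virtual_batch_size` used here are illustrative only.

# Hypothetical example: Batch Renormalization and Ghost Batch Normalization
library(keras)

model <- keras_model_sequential() %>%
  layer_dense(units = 64, activation = "relu", input_shape = c(784)) %>%
  # Batch Renormalization with clipped corrections (see renorm_clipping above);
  # plain numerics are assumed to be accepted in place of scalar Tensors
  layer_batch_normalization(
    renorm = TRUE,
    renorm_clipping = list(rmax = 3, rmin = 1 / 3, dmax = 5),
    renorm_momentum = 0.99
  ) %>%
  layer_dense(units = 32, activation = "relu") %>%
  # "Ghost Batch Normalization": virtual_batch_size must divide the actual
  # batch size used during training (e.g. batch_size = 32 in fit())
  layer_batch_normalization(virtual_batch_size = 8L) %>%
  layer_dense(units = 10, activation = "softmax")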