rstudio
diff --git a/‎NAMESPACE‎
Lines changed: 4 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎R/layer-methods.R‎
Lines changed: 18 additions & 1 deletion b/‎R/layer-methods.R‎
Lines changed: 18 additions & 1 deletion
diff --git a/‎R/layer-text_vectorization.R‎
Lines changed: 132 additions & 0 deletions b/‎R/layer-text_vectorization.R‎
Lines changed: 132 additions & 0 deletions
diff --git a/‎man/adapt.Rd‎
Lines changed: 22 additions & 0 deletions b/‎man/adapt.Rd‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎man/get_vocabulary.Rd‎
Lines changed: 17 additions & 0 deletions b/‎man/get_vocabulary.Rd‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎man/layer_text_vectorization.Rd‎
Lines changed: 76 additions & 0 deletions b/‎man/layer_text_vectorization.Rd‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎man/set_vocabulary.Rd‎
Lines changed: 35 additions & 0 deletions b/‎man/set_vocabulary.Rd‎
Lines changed: 35 additions & 0 deletions
@@ -31,6 +31,7 @@ export(activation_softmax)
 export(activation_softplus)
 export(activation_softsign)
 export(activation_tanh)
+export(adapt)
 export(application_densenet)
 export(application_densenet121)
 export(application_densenet169)
@@ -106,6 +107,7 @@ export(get_layer)
 export(get_output_at)
 export(get_output_mask_at)
 export(get_output_shape_at)
+export(get_vocabulary)
 export(get_weights)
 export(hdf5_matrix)
 export(image_array_resize)
@@ -355,6 +357,7 @@ export(layer_spatial_dropout_1d)
 export(layer_spatial_dropout_2d)
 export(layer_spatial_dropout_3d)
 export(layer_subtract)
+export(layer_text_vectorization)
 export(layer_upsampling_1d)
 export(layer_upsampling_2d)
 export(layer_upsampling_3d)
@@ -438,6 +441,7 @@ export(save_model_weights_tf)
 export(save_text_tokenizer)
 export(sequences_to_matrix)
 export(serialize_model)
+export(set_vocabulary)
 export(set_weights)
 export(shape)
 export(skipgrams)
 
@@ -156,7 +156,24 @@ as_node_index <- function(node_index) {
   as.integer(node_index-1)
 }
 
-
+#' Adapt a preprocessing layer to data
+#'
+#' Calls the layer's own `adapt()` method so that its state is fitted to `data`.
+#'
+#' @param object A preprocessing layer.
+#' @param data Data to fit on: a tf.data Dataset or an R array.
+#' @param reset_state Whether to clear the layer's state before adapting
+#'  (`TRUE`) or continue from the existing state (`FALSE`). Some layers may
+#'  raise an error for `FALSE`. `NULL` means: use the layer's own default.
+#'
+#' @export
+adapt <- function(object, data, reset_state = NULL) {
+  # Forward `reset_state` only when given; each layer keeps its own default.
+  if (!is.null(reset_state)) {
+    return(object$adapt(data, reset_state))
+  }
+  object$adapt(data)
+}
 
 
 
 
@@ -0,0 +1,132 @@
+#' Text vectorization layer
+#'
+#' This layer has basic options for managing text in a Keras model. It
+#' transforms a batch of strings (one sample = one string) into either a list of
+#' token indices (one sample = 1D tensor of integer token indices) or a dense
+#' representation (one sample = 1D tensor of float values representing data about
+#' the sample's tokens).
+#'
+#' The processing of each sample contains the following steps:
+#'
+#' 1) standardize each sample (usually lowercasing + punctuation stripping)
+#' 2) split each sample into substrings (usually words)
+#' 3) recombine substrings into tokens (usually ngrams)
+#' 4) index tokens (associate a unique int value with each token)
+#' 5) transform each sample using this index, either into a vector of ints or
+#'    a dense float vector.
+#'
+#' @inheritParams layer_dense
+#' @param max_tokens The maximum size of the vocabulary for this layer. If `NULL`,
+#'  there is no cap on the size of the vocabulary.
+#' @param standardize Optional specification for standardization to apply to the
+#'  input text. Values can be `NULL` (no standardization),
+#'  `"lower_and_strip_punctuation"` (lowercase and remove punctuation) or a
+#'  Callable. Default is `"lower_and_strip_punctuation"`.
+#' @param split Optional specification for splitting the input text. Values can be
+#'  `NULL` (no splitting), `"whitespace"` (split on ASCII whitespace), or
+#'  a Callable. Default is `"whitespace"`.
+#' @param ngrams Optional specification for ngrams to create from the possibly-split
+#'  input text. Values can be `NULL`, an integer or a list of integers; passing
+#'  an integer will create ngrams up to that integer, and passing a list of
+#'  integers will create ngrams for the specified values in the list. Passing
+#'  `NULL` means that no ngrams will be created.
+#' @param output_mode Optional specification for the output of the layer. Values can
+#'  be `"int"`, `"binary"`, `"count"` or `"tfidf"`, which control the outputs as follows:
+#'  * "int": Outputs integer indices, one integer index per split string token.
+#'  * "binary": Outputs a single int array per batch, of either vocab_size or
+#'   `max_tokens` size, containing 1s in all elements where the token mapped
+#'   to that index exists at least once in the batch item.
+#'  * "count": As "binary", but the int array contains a count of the number of
+#'   times the token at that index appeared in the batch item.
+#'  * "tfidf": As "binary", but the TF-IDF algorithm is applied to find the value
+#'   in each token slot.
+#' @param output_sequence_length Only valid in "int" mode. If set, the output will have
+#'  its time dimension padded or truncated to exactly `output_sequence_length`
+#'  values, resulting in a tensor of shape (batch_size, output_sequence_length) regardless
+#'  of how many tokens resulted from the splitting step. Defaults to `NULL`.
+#' @param pad_to_max_tokens Only valid in "binary", "count", and "tfidf" modes. If `TRUE`,
+#'  the output will have its feature axis padded to `max_tokens` even if the
+#'  number of unique tokens in the vocabulary is less than max_tokens,
+#'  resulting in a tensor of shape (batch_size, max_tokens) regardless of
+#'  vocabulary size. Defaults to `TRUE`.
+#' @param ... Not used.
+#'
+#' @export
+layer_text_vectorization <- function(object, max_tokens = NULL, standardize = "lower_and_strip_punctuation",
+                                     split = "whitespace", ngrams = NULL,
+                                     output_mode = c("int", "binary", "count", "tfidf"),
+                                     output_sequence_length = NULL, pad_to_max_tokens = TRUE,
+                                     ...) {
+  if (tensorflow::tf_version() < "2.1")
+    stop("Text Vectorization requires TensorFlow version >= 2.1", call. = FALSE)
+
+  # Lists always go through as_integer_tuple(): list(2) asks for bigrams only,
+  # whereas the scalar 2 asks for all ngrams up to 2 (see @param ngrams).
+  if (is.list(ngrams) || length(ngrams) > 1)
+    ngrams <- as_integer_tuple(ngrams)
+  else
+    ngrams <- as_nullable_integer(ngrams)
+
+  output_mode <- match.arg(output_mode)
+  args <- list(
+    max_tokens = as_nullable_integer(max_tokens),
+    ngrams = ngrams,
+    output_mode = output_mode,
+    output_sequence_length = as_nullable_integer(output_sequence_length),
+    pad_to_max_tokens = pad_to_max_tokens
+  )
+
+  # Defaults are left to the layer, see
+  # https://github.com/tensorflow/tensorflow/pull/34420. Assign via `[<-` with
+  # list(): `args$x <- NULL` would silently drop an explicit `NULL` (= disabled).
+  if (!identical(standardize, "lower_and_strip_punctuation"))
+    args["standardize"] <- list(standardize)
+  if (!identical(split, "whitespace"))
+    args["split"] <- list(split)
+  create_layer(resolve_text_vectorization_module(), object, args)
+}
+
+#' Retrieve the vocabulary of a text vectorization layer
+#'
+#' @param object A text vectorization layer.
+#' @seealso [set_vocabulary()]
+#' @export
+get_vocabulary <- function(object) {
+  vocabulary <- object$get_vocabulary()
+  vocabulary
+}
+
+#' Sets vocabulary (and optionally document frequency) data for the layer
+#'
+#' Sets the vocabulary and document frequency (DF) data of the layer directly,
+#' instead of deriving them from a dataset through [adapt()]. Use it whenever
+#' the `vocab` (and optionally DF) information is already known. If the layer
+#' already holds vocabulary data, it is replaced when `append` is `FALSE` and
+#' extended when `append` is `TRUE`.
+#'
+#' @inheritParams get_vocabulary
+#' @param vocab An array of string tokens.
+#' @param df_data An array of document frequency data. Only necessary if the layer
+#'  output_mode is "tfidf".
+#' @param oov_df_value The document frequency of the OOV token. Only necessary if
+#'  output_mode is "tfidf". OOV data is optional when appending additional
+#'  data in "tfidf" mode; if an OOV value is supplied it will overwrite the
+#'  existing OOV value.
+#' @param append Append to (`TRUE`) or overwrite (`FALSE`) existing vocabulary data.
+#'
+#' @seealso [get_vocabulary()]
+#' @export
+set_vocabulary <- function(object, vocab, df_data = NULL, oov_df_value = FALSE,
+                           append = FALSE) {
+  # NOTE(review): `oov_df_value` defaults to FALSE, not NULL -- confirm intent.
+  setter <- object$set_vocabulary
+  setter(vocab, df_data, oov_df_value, append)
+}
+
+
+resolve_text_vectorization_module <- function() {
+  # Returns the TextVectorization class; errors if keras_version() < "2.2.4".
+  if (!(keras_version() >= "2.2.4"))
+    stop("Keras >= 2.2.4 is required", call. = FALSE)
+  keras$layers$experimental$preprocessing$TextVectorization
+}