Updating docstrings to explicitly state SparseTensor and RaggedTensor support.

iindyk · tfx-copybara · commit 4c0f0e21ba21 · 2022-11-11T10:29:57.000-08:00
PiperOrigin-RevId: 487858729
diff --git a/tensorflow_transform/analyzers.py b/tensorflow_transform/analyzers.py
@@ -427,13 +427,13 @@ def min(  # pylint: disable=redefined-builtin
     x: common_types.TensorType,
     reduce_instance_dims: bool = True,
     name: Optional[str] = None) -> tf.Tensor:
-  """Computes the minimum of the values of a `Tensor` over the whole dataset.
+  """Computes the minimum of the values of `x` over the whole dataset.
 
   In the case of a `CompositeTensor` missing values will be used in return
   value: for float, NaN is used and for other dtypes the max is used.
 
   Args:
-    x: A `Tensor` or `CompositeTensor`.
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
       dimension and outputs a `Tensor` of the same shape as the input.
@@ -454,13 +454,13 @@ def max(  # pylint: disable=redefined-builtin
     x: common_types.TensorType,
     reduce_instance_dims: bool = True,
     name: Optional[str] = None) -> tf.Tensor:
-  """Computes the maximum of the values of a `Tensor` over the whole dataset.
+  """Computes the maximum of the values of `x` over the whole dataset.
 
   In the case of a `CompositeTensor` missing values will be used in return
   value: for float, NaN is used and for other dtypes the min is used.
 
   Args:
-    x: A `Tensor` or `CompositeTensor`.
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
       dimension and outputs a vector of the same shape as the input.
@@ -478,14 +478,14 @@ def max(  # pylint: disable=redefined-builtin
 def _min_and_max(x: common_types.TensorType,
                  reduce_instance_dims: bool = True,
                  name: Optional[str] = None) -> Tuple[tf.Tensor, tf.Tensor]:
-  """Computes the min and max of the values of a `Tensor` or `CompositeTensor`.
+  """Computes the min and max of the values of `x`.
 
   In the case of a `CompositeTensor` missing values will be used in return
   value:
   for float, NaN is used and for other dtypes the min is used.
 
   Args:
-    x: A `Tensor` or `CompositeTensor`.
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
       dimension and outputs a vector of the same shape as the input.
@@ -530,7 +530,7 @@ def _min_and_max_per_key(
     key_vocabulary_filename: Optional[str] = None,
     name: Optional[str] = None
 ) -> Union[Tuple[tf.Tensor, tf.Tensor, tf.Tensor], tf.Tensor]:
-  """Computes the min and max of the values of a `Tensor` or `CompositeTensor`.
+  """Computes the min and max of the values of `x`.
 
   In the case of a `CompositeTensor` missing values will be used in return
   value: for float, NaN is used and for other dtypes the min is used.
@@ -541,10 +541,10 @@ def _min_and_max_per_key(
   available in a future version.
 
   Args:
-    x: A `Tensor` or `CompositeTensor`.
-    key: A Tensor or `CompositeTensor` of dtype tf.string.  If `x` is a
-      `CompositeTensor`, `key` must exactly match `x` in everything except
-      values.
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
+    key: A `Tensor`, `SparseTensor`, or `RaggedTensor` of dtype tf.string.  If
+      `x` is a `CompositeTensor`, `key` must exactly match `x` in everything
+      except values.
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
       dimension and outputs a vector of the same shape as the input. The False
@@ -628,9 +628,9 @@ def sum(  # pylint: disable=redefined-builtin
   """Computes the sum of the values of a `Tensor` over the whole dataset.
 
   Args:
-    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
-        (float{16|32|64}),integral (int{8|16|32|64}), or
-        unsigned integral (uint{8|16})
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
+        point (float{16|32|64}), integral (int{8|16|32|64}), or unsigned
+        integral (uint{8|16}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
         to arrive at a single scalar output. If False, only collapses the batch
         dimension and outputs a vector of the same shape as the input.
@@ -694,7 +694,7 @@ def histogram(x: common_types.TensorType,
                            zip(classes, probabilities)))
 
   Args:
-    x: A `Tensor` or `CompositeTensor`.
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
     boundaries: (Optional) A `Tensor` or `int` used to build the histogram;
       ignored if `categorical` is True. If possible, provide boundaries as
       multiple sorted values.  Default to 10 intervals over the 0-1 range, or
@@ -746,7 +746,7 @@ def size(x: common_types.TensorType,
   """Computes the total size of instances in a `Tensor` over the whole dataset.
 
   Args:
-    x: A `Tensor` or `CompositeTensor`.
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
       dimension and outputs a vector of the same shape as the input.
@@ -774,7 +774,8 @@ def count_per_key(key: common_types.TensorType,
   """Computes the count of each element of a `Tensor`.
 
   Args:
-    key: A Tensor or `CompositeTensor` of dtype tf.string or tf.int.
+    key: A `Tensor`, `SparseTensor`, or `RaggedTensor` of dtype tf.string or
+      tf.int.
     key_vocabulary_filename: (Optional) The file name for the key-output mapping
       file. If None and key are provided, this combiner assumes the keys fit in
       memory and will not store the result in a file. If empty string, a file
@@ -824,8 +825,8 @@ def mean(x: common_types.TensorType,
   """Computes the mean of the values of a `Tensor` over the whole dataset.
 
   Args:
-    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
-        (float{16|32|64}), or integral ([u]int{8|16|32|64}).
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
+        point (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
         to arrive at a single scalar output. If False, only collapses the batch
         dimension and outputs a vector of the same shape as the input.
@@ -855,8 +856,8 @@ def var(x: common_types.TensorType,
   (x - mean(x))**2 / length(x).
 
   Args:
-    x: `Tensor` or `CompositeTensor`. Its type must be floating point
-        (float{16|32|64}), or integral ([u]int{8|16|32|64}).
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
+        point (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
         to arrive at a single scalar output. If False, only collapses the batch
         dimension and outputs a vector of the same shape as the input.
@@ -930,8 +931,8 @@ def tukey_location(x: common_types.TensorType,
   Mathematics, vol. 2012, 2012. doi:10.5402/2012/980153
 
   Args:
-    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
-        (float{16|32|64}), or integral ([u]int{8|16|32|64}).
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
+        point (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
         to arrive at a single scalar output. If False, only collapses the batch
         dimension and outputs a vector of the same shape as the input.
@@ -968,8 +969,8 @@ def tukey_scale(x: common_types.TensorType,
 
 
   Args:
-    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
-        (float{16|32|64}), or integral ([u]int{8|16|32|64}).
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
+        point (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
         to arrive at a single scalar output. If False, only collapses the batch
         dimension and outputs a vector of the same shape as the input.
@@ -1005,8 +1006,8 @@ def tukey_h_params(x: common_types.TensorType,
   Mathematics, vol. 2012, 2012. doi:10.5402/2012/980153
 
   Args:
-    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
-        (float{16|32|64}), or integral ([u]int{8|16|32|64}).
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
+        point (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
         to arrive at a single scalar output. If False, only collapses the batch
         dimension and outputs a vector of the same shape as the input.
@@ -1075,10 +1076,10 @@ def _mean_and_var_per_key(
   """`mean_and_var` by group, specified by key.
 
   Args:
-    x: A `Tensor` or `CompositeTensor`.
-    key: A Tensor or `CompositeTensor` of dtype tf.string.  If `x` is
-      a `CompositeTensor`, `key` must exactly match `x` in everything except
-      values.
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
+    key: A `Tensor`, `SparseTensor`, or `RaggedTensor` of dtype tf.string.  If
+      `x` is a `CompositeTensor`, `key` must exactly match `x` in everything
+      except values.
     reduce_instance_dims: (Optional) By default collapses the batch and instance
         dimensions to arrive at a single scalar output. The False case is not
         currently supported for _mean_and_var_per_key.
@@ -1726,11 +1727,11 @@ def vocabulary(
     file_format: common_types
     .VocabularyFileFormatType = DEFAULT_VOCABULARY_FILE_FORMAT,
     name: Optional[str] = None) -> common_types.TemporaryAnalyzerOutputType:
-  r"""Computes the unique values of a `Tensor` over the whole dataset.
+  r"""Computes the unique values of `x` over the whole dataset.
 
-  Computes The unique values taken by `x`, which can be a `Tensor` or
-  `CompositeTensor` of any size.  The unique values will be aggregated over all
-  dimensions of `x` and all instances.
+  Computes the unique values taken by `x`, which can be a `Tensor`,
+  `SparseTensor`, or `RaggedTensor` of any size.  The unique values will be
+  aggregated over all dimensions of `x` and all instances.
 
   In case `file_format` is 'text' and one of the tokens contains the '\n' or
   '\r' characters or is empty it will be discarded.
@@ -1774,9 +1775,9 @@ def vocabulary(
   within each vocabulary entry (b/117796748).
 
   Args:
-    x: A categorical/discrete input `Tensor` or `CompositeTensor` with dtype
-      tf.string or tf.int[8|16|32|64]. The inputs should generally be unique per
-      row (i.e. a bag of words/ngrams representation).
+    x: A categorical/discrete input `Tensor`, `SparseTensor`, or `RaggedTensor`
+      with dtype tf.string or tf.int[8|16|32|64]. The inputs should generally be
+      unique per row (i.e. a bag of words/ngrams representation).
     top_k: Limit the generated vocabulary to the first `top_k` elements. If set
       to None, the full vocabulary is generated.
     frequency_threshold: Limit the generated vocabulary only to elements whose
diff --git a/tensorflow_transform/beam/impl.py b/tensorflow_transform/beam/impl.py
@@ -385,11 +385,11 @@ def setup(self):
       ]
 
   def process(self, batch, saved_model_dir):
-    """Runs the given graph to realize the output `Tensor` or `SparseTensor`s.
+    """Runs the given graph to realize the outputs.
 
     Runs the graph in a TF session for computing the output values of the
-    `Tensor` or `SparseTensor`s, given an input row of data (input `Tensor` or
-    `SparseTensor`s).
+    `Tensor`s, `SparseTensor`s, or `RaggedTensor`s, given an input row of data
+    (input `Tensor`s, `SparseTensor`s, or `RaggedTensor`s).
 
     Args:
       batch: the batch of elements being processed by the DoFn
@@ -970,7 +970,7 @@ def __init__(self, preprocessing_fn, pipeline=None):
 
     Args:
       preprocessing_fn: A function that accepts and returns a dictionary from
-        strings to `Tensor` or `SparseTensor`s.
+        strings to `Tensor`s, `SparseTensor`s, or `RaggedTensor`s.
       pipeline: (Optional) a beam Pipeline.
     """
     self._preprocessing_fn = preprocessing_fn
@@ -1326,7 +1326,7 @@ def __init__(self, preprocessing_fn, output_record_batches=False):
 
     Args:
       preprocessing_fn: A function that accepts and returns a dictionary from
-          strings to `Tensor` or `SparseTensor`s.
+          strings to `Tensor`s, `SparseTensor`s, or `RaggedTensor`s.
       output_record_batches: (Optional) A bool. If `True`,
           `AnalyzeAndTransformDataset` outputs `pyarrow.RecordBatch`es;
           otherwise, outputs instance dicts.
diff --git a/tensorflow_transform/experimental/analyzers.py b/tensorflow_transform/experimental/analyzers.py
@@ -114,7 +114,8 @@ def _apply_analyzer(ptransform: Union[_BeamPTransform,
   Args:
     ptransform: A class inheriting from analyzer_nodes.AnalyzerDef or
       CacheablePTransformAnalyzer that should be applied.
-    *tensor_inputs: A list of input `Tensor`s or `CompositeTensor`s.
+    *tensor_inputs: A list of input `Tensor`s, `SparseTensor`s, or
+      `RaggedTensor`s.
     **analyzer_def_kwargs: KW arguments to use when constructing
       analyzer_def_cls.
 
@@ -315,9 +316,9 @@ def approximate_vocabulary(
     name: Optional[str] = None) -> common_types.TemporaryAnalyzerOutputType:
   r"""Computes the unique values of a `Tensor` over the whole dataset.
 
-  Approximately computes the unique values taken by `x`, which can be a `Tensor`
-  or `CompositeTensor` of any size.  The unique values will be aggregated over
-  all dimensions of `x` and all instances.
+  Approximately computes the unique values taken by `x`, which can be a
+  `Tensor`, `SparseTensor`, or `RaggedTensor` of any size.  The unique values
+  will be aggregated over all dimensions of `x` and all instances.
 
   This analyzer provides an approximate alternative to `tft.vocabulary` that can
   be more efficient with smaller `top_k` and/or smaller number of unique
@@ -360,8 +361,8 @@ def approximate_vocabulary(
   if `x` is numerical dtype (e.g. [('3', 5), ('2', 3), ('111', 3)]).
 
   Args:
-    x: A categorical/discrete input `Tensor` or `CompositeTensor` with dtype
-      tf.string or tf.int[8|16|32|64].
+    x: A categorical/discrete input `Tensor`, `SparseTensor`, or `RaggedTensor`
+      with dtype tf.string or tf.int[8|16|32|64].
     top_k: Limit the generated vocabulary to the first `top_k` elements. Note
       that if `top_k` is larger than the number of unique elements in `x`, then
       the result will be exact.
@@ -526,7 +527,7 @@ def _get_approximate_vocabulary_analyzer_inputs(
   """Helper for constructing approximate vocabulary inputs from tensors.
 
   Args:
-    x: `Tensor` or `CompositeTensor` to compute vocabulary over.
+    x: `Tensor`, `SparseTensor`, or `RaggedTensor` to compute vocabulary over.
     file_format: The format of the resulting vocabulary file.
       'tfrecord_gzip' requires tensorflow>=2.4.
     weights: Optional `Tensor` of weights.
diff --git a/tensorflow_transform/experimental/mappers.py b/tensorflow_transform/experimental/mappers.py
@@ -51,7 +51,8 @@ def compute_and_apply_approximate_vocabulary(
   """Generates an approximate vocabulary for `x` and maps it to an integer.
 
   Args:
-    x: A `Tensor` or `CompositeTensor` of type tf.string or tf.int[8|16|32|64].
+    x: A `Tensor`, `SparseTensor`, or `RaggedTensor` of type tf.string or
+      tf.int[8|16|32|64].
     default_value: The value to use for out-of-vocabulary values, unless
       'num_oov_buckets' is greater than zero.
     top_k: Limit the generated vocabulary to the first `top_k` elements. If set
@@ -73,13 +74,14 @@ def compute_and_apply_approximate_vocabulary(
     name: (Optional) A name for this operation.
 
   Returns:
-    A `Tensor` or `CompositeTensor` where each string value is mapped to an
-    integer. Each unique string value that appears in the vocabulary
-    is mapped to a different integer and integers are consecutive starting from
-    zero. String value not in the vocabulary is assigned default_value.
-    Alternatively, if num_oov_buckets is specified, out of vocabulary strings
-    are hashed to values in [vocab_size, vocab_size + num_oov_buckets) for an
-    overall range of [0, vocab_size + num_oov_buckets).
+    A `Tensor`, `SparseTensor`, or `RaggedTensor` where each string value is
+    mapped to an integer. Each unique string value that appears in the
+    vocabulary is mapped to a different integer and integers are consecutive
+    starting from zero. A string value not in the vocabulary is assigned
+    `default_value`. Alternatively, if `num_oov_buckets` is specified, out of
+    vocabulary strings are hashed to values in
+    [vocab_size, vocab_size + num_oov_buckets) for an overall range of
+    [0, vocab_size + num_oov_buckets).
 
   Raises:
     ValueError: If `top_k` is negative.
diff --git a/tensorflow_transform/mappers.py b/tensorflow_transform/mappers.py
diff --git a/tensorflow_transform/output_wrapper.py b/tensorflow_transform/output_wrapper.py