Skip to content

Commit 4c0f0e2

Browse files
iindyk, tfx-copybara
authored and committed
Updating docstrings to explicitly state SparseTensor and RaggedTensor support.
PiperOrigin-RevId: 487858729
1 parent e245f20 commit 4c0f0e2

File tree

6 files changed

+183
-171
lines changed

6 files changed

+183
-171
lines changed

tensorflow_transform/analyzers.py

Lines changed: 39 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -427,13 +427,13 @@ def min( # pylint: disable=redefined-builtin
427427
x: common_types.TensorType,
428428
reduce_instance_dims: bool = True,
429429
name: Optional[str] = None) -> tf.Tensor:
430-
"""Computes the minimum of the values of a `Tensor` over the whole dataset.
430+
"""Computes the minimum of the values of `x` over the whole dataset.
431431
432432
In the case of a `CompositeTensor` missing values will be used in return
433433
value: for float, NaN is used and for other dtypes the max is used.
434434
435435
Args:
436-
x: A `Tensor` or `CompositeTensor`.
436+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
437437
reduce_instance_dims: By default collapses the batch and instance dimensions
438438
to arrive at a single scalar output. If False, only collapses the batch
439439
dimension and outputs a `Tensor` of the same shape as the input.
@@ -454,13 +454,13 @@ def max( # pylint: disable=redefined-builtin
454454
x: common_types.TensorType,
455455
reduce_instance_dims: bool = True,
456456
name: Optional[str] = None) -> tf.Tensor:
457-
"""Computes the maximum of the values of a `Tensor` over the whole dataset.
457+
"""Computes the maximum of the values of `x` over the whole dataset.
458458
459459
In the case of a `CompositeTensor` missing values will be used in return
460460
value: for float, NaN is used and for other dtypes the min is used.
461461
462462
Args:
463-
x: A `Tensor` or `CompositeTensor`.
463+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
464464
reduce_instance_dims: By default collapses the batch and instance dimensions
465465
to arrive at a single scalar output. If False, only collapses the batch
466466
dimension and outputs a vector of the same shape as the input.
@@ -478,14 +478,14 @@ def max( # pylint: disable=redefined-builtin
478478
def _min_and_max(x: common_types.TensorType,
479479
reduce_instance_dims: bool = True,
480480
name: Optional[str] = None) -> Tuple[tf.Tensor, tf.Tensor]:
481-
"""Computes the min and max of the values of a `Tensor` or `CompositeTensor`.
481+
"""Computes the min and max of the values of `x`.
482482
483483
In the case of a `CompositeTensor` missing values will be used in return
484484
value:
485485
for float, NaN is used and for other dtypes the min is used.
486486
487487
Args:
488-
x: A `Tensor` or `CompositeTensor`.
488+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
489489
reduce_instance_dims: By default collapses the batch and instance dimensions
490490
to arrive at a single scalar output. If False, only collapses the batch
491491
dimension and outputs a vector of the same shape as the input.
@@ -530,7 +530,7 @@ def _min_and_max_per_key(
530530
key_vocabulary_filename: Optional[str] = None,
531531
name: Optional[str] = None
532532
) -> Union[Tuple[tf.Tensor, tf.Tensor, tf.Tensor], tf.Tensor]:
533-
"""Computes the min and max of the values of a `Tensor` or `CompositeTensor`.
533+
"""Computes the min and max of the values of `x`.
534534
535535
In the case of a `CompositeTensor` missing values will be used in return
536536
value: for float, NaN is used and for other dtypes the min is used.
@@ -541,10 +541,10 @@ def _min_and_max_per_key(
541541
available in a future version.
542542
543543
Args:
544-
x: A `Tensor` or `CompositeTensor`.
545-
key: A Tensor or `CompositeTensor` of dtype tf.string. If `x` is a
546-
`CompositeTensor`, `key` must exactly match `x` in everything except
547-
values.
544+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
545+
key: A `Tensor`, `SparseTensor`, or `RaggedTensor` of dtype tf.string. If
546+
`x` is a `CompositeTensor`, `key` must exactly match `x` in everything
547+
except values.
548548
reduce_instance_dims: By default collapses the batch and instance dimensions
549549
to arrive at a single scalar output. If False, only collapses the batch
550550
dimension and outputs a vector of the same shape as the input. The False
@@ -628,9 +628,9 @@ def sum( # pylint: disable=redefined-builtin
628628
"""Computes the sum of the values of a `Tensor` over the whole dataset.
629629
630630
Args:
631-
x: A `Tensor` or `CompositeTensor`. Its type must be floating point
632-
(float{16|32|64}),integral (int{8|16|32|64}), or
633-
unsigned integral (uint{8|16})
631+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
632+
point (float{16|32|64}), integral (int{8|16|32|64}), or unsigned
633+
integral (uint{8|16}).
634634
reduce_instance_dims: By default collapses the batch and instance dimensions
635635
to arrive at a single scalar output. If False, only collapses the batch
636636
dimension and outputs a vector of the same shape as the input.
@@ -694,7 +694,7 @@ def histogram(x: common_types.TensorType,
694694
zip(classes, probabilities)))
695695
696696
Args:
697-
x: A `Tensor` or `CompositeTensor`.
697+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
698698
boundaries: (Optional) A `Tensor` or `int` used to build the histogram;
699699
ignored if `categorical` is True. If possible, provide boundaries as
700700
multiple sorted values. Default to 10 intervals over the 0-1 range, or
@@ -746,7 +746,7 @@ def size(x: common_types.TensorType,
746746
"""Computes the total size of instances in a `Tensor` over the whole dataset.
747747
748748
Args:
749-
x: A `Tensor` or `CompositeTensor`.
749+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
750750
reduce_instance_dims: By default collapses the batch and instance dimensions
751751
to arrive at a single scalar output. If False, only collapses the batch
752752
dimension and outputs a vector of the same shape as the input.
@@ -774,7 +774,8 @@ def count_per_key(key: common_types.TensorType,
774774
"""Computes the count of each element of a `Tensor`.
775775
776776
Args:
777-
key: A Tensor or `CompositeTensor` of dtype tf.string or tf.int.
777+
key: A `Tensor`, `SparseTensor`, or `RaggedTensor` of dtype tf.string or
778+
tf.int.
778779
key_vocabulary_filename: (Optional) The file name for the key-output mapping
779780
file. If None and key are provided, this combiner assumes the keys fit in
780781
memory and will not store the result in a file. If empty string, a file
@@ -824,8 +825,8 @@ def mean(x: common_types.TensorType,
824825
"""Computes the mean of the values of a `Tensor` over the whole dataset.
825826
826827
Args:
827-
x: A `Tensor` or `CompositeTensor`. Its type must be floating point
828-
(float{16|32|64}), or integral ([u]int{8|16|32|64}).
828+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
829+
point (float{16|32|64}), or integral ([u]int{8|16|32|64}).
829830
reduce_instance_dims: By default collapses the batch and instance dimensions
830831
to arrive at a single scalar output. If False, only collapses the batch
831832
dimension and outputs a vector of the same shape as the input.
@@ -855,8 +856,8 @@ def var(x: common_types.TensorType,
855856
(x - mean(x))**2 / length(x).
856857
857858
Args:
858-
x: `Tensor` or `CompositeTensor`. Its type must be floating point
859-
(float{16|32|64}), or integral ([u]int{8|16|32|64}).
859+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
860+
point (float{16|32|64}), or integral ([u]int{8|16|32|64}).
860861
reduce_instance_dims: By default collapses the batch and instance dimensions
861862
to arrive at a single scalar output. If False, only collapses the batch
862863
dimension and outputs a vector of the same shape as the input.
@@ -930,8 +931,8 @@ def tukey_location(x: common_types.TensorType,
930931
Mathematics, vol. 2012, 2012. doi:10.5402/2012/980153
931932
932933
Args:
933-
x: A `Tensor` or `CompositeTensor`. Its type must be floating point
934-
(float{16|32|64}), or integral ([u]int{8|16|32|64}).
934+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
935+
point (float{16|32|64}), or integral ([u]int{8|16|32|64}).
935936
reduce_instance_dims: By default collapses the batch and instance dimensions
936937
to arrive at a single scalar output. If False, only collapses the batch
937938
dimension and outputs a vector of the same shape as the input.
@@ -968,8 +969,8 @@ def tukey_scale(x: common_types.TensorType,
968969
969970
970971
Args:
971-
x: A `Tensor` or `CompositeTensor`. Its type must be floating point
972-
(float{16|32|64}), or integral ([u]int{8|16|32|64}).
972+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
973+
point (float{16|32|64}), or integral ([u]int{8|16|32|64}).
973974
reduce_instance_dims: By default collapses the batch and instance dimensions
974975
to arrive at a single scalar output. If False, only collapses the batch
975976
dimension and outputs a vector of the same shape as the input.
@@ -1005,8 +1006,8 @@ def tukey_h_params(x: common_types.TensorType,
10051006
Mathematics, vol. 2012, 2012. doi:10.5402/2012/980153
10061007
10071008
Args:
1008-
x: A `Tensor` or `CompositeTensor`. Its type must be floating point
1009-
(float{16|32|64}), or integral ([u]int{8|16|32|64}).
1009+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`. Its type must be floating
1010+
point (float{16|32|64}), or integral ([u]int{8|16|32|64}).
10101011
reduce_instance_dims: By default collapses the batch and instance dimensions
10111012
to arrive at a single scalar output. If False, only collapses the batch
10121013
dimension and outputs a vector of the same shape as the input.
@@ -1075,10 +1076,10 @@ def _mean_and_var_per_key(
10751076
"""`mean_and_var` by group, specified by key.
10761077
10771078
Args:
1078-
x: A `Tensor` or `CompositeTensor`.
1079-
key: A Tensor or `CompositeTensor` of dtype tf.string. If `x` is
1080-
a `CompositeTensor`, `key` must exactly match `x` in everything except
1081-
values.
1079+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor`.
1080+
key: A `Tensor`, `SparseTensor`, or `RaggedTensor` of dtype tf.string. If
1081+
`x` is a `CompositeTensor`, `key` must exactly match `x` in everything
1082+
except values.
10821083
reduce_instance_dims: (Optional) By default collapses the batch and instance
10831084
dimensions to arrive at a single scalar output. The False case is not
10841085
currently supported for _mean_and_var_per_key.
@@ -1726,11 +1727,11 @@ def vocabulary(
17261727
file_format: common_types
17271728
.VocabularyFileFormatType = DEFAULT_VOCABULARY_FILE_FORMAT,
17281729
name: Optional[str] = None) -> common_types.TemporaryAnalyzerOutputType:
1729-
r"""Computes the unique values of a `Tensor` over the whole dataset.
1730+
r"""Computes the unique values of `x` over the whole dataset.
17301731
1731-
Computes The unique values taken by `x`, which can be a `Tensor` or
1732-
`CompositeTensor` of any size. The unique values will be aggregated over all
1733-
dimensions of `x` and all instances.
1732+
Computes the unique values taken by `x`, which can be a `Tensor`,
1733+
`SparseTensor`, or `RaggedTensor` of any size. The unique values will be
1734+
aggregated over all dimensions of `x` and all instances.
17341735
17351736
In case `file_format` is 'text' and one of the tokens contains the '\n' or
17361737
'\r' characters or is empty it will be discarded.
@@ -1774,9 +1775,9 @@ def vocabulary(
17741775
within each vocabulary entry (b/117796748).
17751776
17761777
Args:
1777-
x: A categorical/discrete input `Tensor` or `CompositeTensor` with dtype
1778-
tf.string or tf.int[8|16|32|64]. The inputs should generally be unique per
1779-
row (i.e. a bag of words/ngrams representation).
1778+
x: A categorical/discrete input `Tensor`, `SparseTensor`, or `RaggedTensor`
1779+
with dtype tf.string or tf.int[8|16|32|64]. The inputs should generally be
1780+
unique per row (i.e. a bag of words/ngrams representation).
17801781
top_k: Limit the generated vocabulary to the first `top_k` elements. If set
17811782
to None, the full vocabulary is generated.
17821783
frequency_threshold: Limit the generated vocabulary only to elements whose

tensorflow_transform/beam/impl.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -385,11 +385,11 @@ def setup(self):
385385
]
386386

387387
def process(self, batch, saved_model_dir):
388-
"""Runs the given graph to realize the output `Tensor` or `SparseTensor`s.
388+
"""Runs the given graph to realize the outputs.
389389
390390
Runs the graph in a TF session for computing the output values of the
391-
`Tensor` or `SparseTensor`s, given an input row of data (input `Tensor` or
392-
`SparseTensor`s).
391+
`Tensor`s, `SparseTensor`s, or `RaggedTensor`s, given an input row of data
392+
(input `Tensor`s, `SparseTensor`s, or `RaggedTensor`s).
393393
394394
Args:
395395
batch: the batch of elements being processed by the DoFn
@@ -970,7 +970,7 @@ def __init__(self, preprocessing_fn, pipeline=None):
970970
971971
Args:
972972
preprocessing_fn: A function that accepts and returns a dictionary from
973-
strings to `Tensor` or `SparseTensor`s.
973+
strings to `Tensor`s, `SparseTensor`s, or `RaggedTensor`s.
974974
pipeline: (Optional) a beam Pipeline.
975975
"""
976976
self._preprocessing_fn = preprocessing_fn
@@ -1326,7 +1326,7 @@ def __init__(self, preprocessing_fn, output_record_batches=False):
13261326
13271327
Args:
13281328
preprocessing_fn: A function that accepts and returns a dictionary from
1329-
strings to `Tensor` or `SparseTensor`s.
1329+
strings to `Tensor`s, `SparseTensor`s, or `RaggedTensor`s.
13301330
output_record_batches: (Optional) A bool. If `True`,
13311331
`AnalyzeAndTransformDataset` outputs `pyarrow.RecordBatch`es;
13321332
otherwise, outputs instance dicts.

tensorflow_transform/experimental/analyzers.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,8 @@ def _apply_analyzer(ptransform: Union[_BeamPTransform,
114114
Args:
115115
ptransform: A class inheriting from analyzer_nodes.AnalyzerDef or
116116
CacheablePTransformAnalyzer that should be applied.
117-
*tensor_inputs: A list of input `Tensor`s or `CompositeTensor`s.
117+
*tensor_inputs: A list of input `Tensor`s, `SparseTensor`s, or
118+
`RaggedTensor`s.
118119
**analyzer_def_kwargs: KW arguments to use when constructing
119120
analyzer_def_cls.
120121
@@ -315,9 +316,9 @@ def approximate_vocabulary(
315316
name: Optional[str] = None) -> common_types.TemporaryAnalyzerOutputType:
316317
r"""Computes the unique values of a `Tensor` over the whole dataset.
317318
318-
Approximately computes the unique values taken by `x`, which can be a `Tensor`
319-
or `CompositeTensor` of any size. The unique values will be aggregated over
320-
all dimensions of `x` and all instances.
319+
Approximately computes the unique values taken by `x`, which can be a
320+
`Tensor`, `SparseTensor`, or `RaggedTensor` of any size. The unique values
321+
will be aggregated over all dimensions of `x` and all instances.
321322
322323
This analyzer provides an approximate alternative to `tft.vocabulary` that can
323324
be more efficient with smaller `top_k` and/or smaller number of unique
@@ -360,8 +361,8 @@ def approximate_vocabulary(
360361
if `x` is numerical dtype (e.g. [('3', 5), ('2', 3), ('111', 3)]).
361362
362363
Args:
363-
x: A categorical/discrete input `Tensor` or `CompositeTensor` with dtype
364-
tf.string or tf.int[8|16|32|64].
364+
x: A categorical/discrete input `Tensor`, `SparseTensor`, or `RaggedTensor`
365+
with dtype tf.string or tf.int[8|16|32|64].
365366
top_k: Limit the generated vocabulary to the first `top_k` elements. Note
366367
that if `top_k` is larger than the number of unique elements in `x`, then
367368
the result will be exact.
@@ -526,7 +527,7 @@ def _get_approximate_vocabulary_analyzer_inputs(
526527
"""Helper for constructing approximate vocabulary inputs from tensors.
527528
528529
Args:
529-
x: `Tensor` or `CompositeTensor` to compute vocabulary over.
530+
x: `Tensor`, `SparseTensor`, or `RaggedTensor` to compute vocabulary over.
530531
file_format: The format of the resulting vocabulary file.
531532
'tfrecord_gzip' requires tensorflow>=2.4.
532533
weights: Optional `Tensor` of weights.

tensorflow_transform/experimental/mappers.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ def compute_and_apply_approximate_vocabulary(
5151
"""Generates an approximate vocabulary for `x` and maps it to an integer.
5252
5353
Args:
54-
x: A `Tensor` or `CompositeTensor` of type tf.string or tf.int[8|16|32|64].
54+
x: A `Tensor`, `SparseTensor`, or `RaggedTensor` of type tf.string or
55+
tf.int[8|16|32|64].
5556
default_value: The value to use for out-of-vocabulary values, unless
5657
'num_oov_buckets' is greater than zero.
5758
top_k: Limit the generated vocabulary to the first `top_k` elements. If set
@@ -73,13 +74,14 @@ def compute_and_apply_approximate_vocabulary(
7374
name: (Optional) A name for this operation.
7475
7576
Returns:
76-
A `Tensor` or `CompositeTensor` where each string value is mapped to an
77-
integer. Each unique string value that appears in the vocabulary
78-
is mapped to a different integer and integers are consecutive starting from
79-
zero. String value not in the vocabulary is assigned default_value.
80-
Alternatively, if num_oov_buckets is specified, out of vocabulary strings
81-
are hashed to values in [vocab_size, vocab_size + num_oov_buckets) for an
82-
overall range of [0, vocab_size + num_oov_buckets).
77+
A `Tensor`, `SparseTensor`, or `RaggedTensor` where each string value is
78+
mapped to an integer. Each unique string value that appears in the
79+
vocabulary is mapped to a different integer and integers are consecutive
80+
starting from zero. A string value not in the vocabulary is assigned
81+
`default_value`. Alternatively, if `num_oov_buckets` is specified, out of
82+
vocabulary strings are hashed to values in
83+
[vocab_size, vocab_size + num_oov_buckets) for an overall range of
84+
[0, vocab_size + num_oov_buckets).
8385
8486
Raises:
8587
ValueError: If `top_k` is negative.

0 commit comments

Comments
 (0)