Implement sparse output tensor annotations: one for setting a dense_shape, and one for forcing 2D sparse tensors to be represented as sparse rather than varlen.

zoyahav · tfx-copybara · commit 935fcf05690a · 2023-02-10T06:38:05.000-08:00
PiperOrigin-RevId: 508646165
diff --git a/RELEASE.md b/RELEASE.md
@@ -7,6 +7,9 @@
 *   `RaggedTensor`s can now be automatically inferred for variable length
     features by setting `represent_variable_length_as_ragged=true` in TFMD
     schema.
+*   New experimental APIs added for annotating sparse output tensors:
+    `tft.experimental.annotate_sparse_output_shape` and
+    `tft.experimental.annotate_true_sparse_output`.
 
 ## Bug Fixes and Other Changes
 
diff --git a/tensorflow_transform/beam/annotators_test.py b/tensorflow_transform/beam/annotators_test.py
@@ -0,0 +1,221 @@
+# coding=utf-8
+#
+# Copyright 2023 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for tft annotators."""
+
+import tensorflow as tf
+import tensorflow_transform as tft
+from tensorflow_transform.beam import tft_unit
+from google.protobuf import text_format
+from tensorflow_metadata.proto.v0 import schema_pb2
+
+
+_TF_VERSION_NAMED_PARAMETERS = [
+    dict(testcase_name='CompatV1', use_tf_compat_v1=True),
+    dict(testcase_name='V2', use_tf_compat_v1=False),
+]
+
+
+class AnnotatorsTest(tft_unit.TransformTestCase):
+
+  @tft_unit.named_parameters(*_TF_VERSION_NAMED_PARAMETERS)
+  def test_annotate_sparse_outputs(self, use_tf_compat_v1):
+    def preprocessing_fn(inputs):
+      outputs = inputs.copy()
+      x = tf.sparse.expand_dims(inputs['x'], -1)
+      outputs['x'] = x
+      tft.experimental.annotate_sparse_output_shape(x, [1, 1])
+      tft.experimental.annotate_sparse_output_shape(outputs['y'], [17])
+      tft.experimental.annotate_true_sparse_output(outputs['z'])
+      return outputs
+
+    input_data_dicts = [dict(x=[1], y=[2], z=[3], t=[4]) for x in range(10)]
+    input_metadata = tft.DatasetMetadata.from_feature_spec({
+        'x': tf.io.VarLenFeature(tf.int64),
+        'y': tf.io.VarLenFeature(tf.int64),
+        'z': tf.io.VarLenFeature(tf.int64),
+        't': tf.io.VarLenFeature(tf.int64),
+    })
+    schema = text_format.Parse(
+        """
+        feature {
+          name: "t"
+          type: INT
+        }
+        feature {
+          name: "x$sparse_indices_0"
+          type: INT
+          int_domain {
+            min: 0
+            max: 0
+          }
+        }
+        feature {
+          name: "x$sparse_indices_1"
+          type: INT
+          int_domain {
+            min: 0
+            max: 0
+          }
+        }
+        feature {
+          name: "x$sparse_values"
+          type: INT
+        }
+        feature {
+          name: "y$sparse_indices_0"
+          type: INT
+          int_domain {
+            min: 0
+            max: 16
+          }
+        }
+        feature {
+          name: "y$sparse_values"
+          type: INT
+        }
+        feature {
+          name: "z$sparse_indices_0"
+          type: INT
+        }
+        feature {
+          name: "z$sparse_values"
+          type: INT
+        }
+        sparse_feature {
+          name: "x"
+          index_feature {
+            name: "x$sparse_indices_0"
+          }
+          index_feature {
+            name: "x$sparse_indices_1"
+          }
+          is_sorted: true
+          value_feature {
+            name: "x$sparse_values"
+          }
+        }
+        sparse_feature {
+          name: "y"
+          index_feature {
+            name: "y$sparse_indices_0"
+          }
+          is_sorted: true
+          value_feature {
+            name: "y$sparse_values"
+          }
+        }
+        sparse_feature {
+          name: "z"
+          index_feature {
+            name: "z$sparse_indices_0"
+          }
+          is_sorted: true
+          value_feature {
+            name: "z$sparse_values"
+          }
+        }
+    """,
+        schema_pb2.Schema(),
+    )
+    if not tft_unit.is_external_environment():
+      schema.generate_legacy_feature_spec = False
+    self.assertAnalyzeAndTransformResults(
+        input_data_dicts,
+        input_metadata,
+        preprocessing_fn,
+        expected_metadata=tft.DatasetMetadata(schema),
+        force_tf_compat_v1=use_tf_compat_v1,
+        output_record_batches=True,
+    )
+
+  @tft_unit.named_parameters(*_TF_VERSION_NAMED_PARAMETERS)
+  def test_conflicting_sparse_outputs_annotations(self, use_tf_compat_v1):
+    def preprocessing_fn(inputs):
+      tft.experimental.annotate_sparse_output_shape(inputs['x'], [3])
+      tft.experimental.annotate_sparse_output_shape(inputs['x'], [17])
+      tft.experimental.annotate_true_sparse_output(inputs['x'])
+      return inputs
+
+    input_data_dicts = [dict(x=[1]) for x in range(10)]
+    input_metadata = tft.DatasetMetadata.from_feature_spec(
+        {
+            'x': tf.io.VarLenFeature(tf.int64),
+        }
+    )
+    schema = text_format.Parse(
+        """
+      feature {
+        name: "x$sparse_indices_0"
+        type: INT
+        int_domain {
+          min: 0
+          max: 16
+        }
+      }
+      feature {
+        name: "x$sparse_values"
+        type: INT
+      }
+      sparse_feature {
+        name: "x"
+        index_feature {
+          name: "x$sparse_indices_0"
+        }
+        is_sorted: true
+        value_feature {
+          name: "x$sparse_values"
+        }
+      }
+    """,
+        schema_pb2.Schema(),
+    )
+    if not tft_unit.is_external_environment():
+      schema.generate_legacy_feature_spec = False
+    self.assertAnalyzeAndTransformResults(
+        input_data_dicts,
+        input_metadata,
+        preprocessing_fn,
+        expected_metadata=tft.DatasetMetadata(schema),
+        force_tf_compat_v1=use_tf_compat_v1,
+        output_record_batches=True,
+    )
+
+  @tft_unit.named_parameters(*_TF_VERSION_NAMED_PARAMETERS)
+  def test_invalid_sparse_outputs_annotations(self, use_tf_compat_v1):
+    def preprocessing_fn(inputs):
+      tft.experimental.annotate_sparse_output_shape(inputs['x'], [3, 42])
+      return inputs
+
+    input_data_dicts = [dict(x=[1]) for x in range(10)]
+    input_metadata = tft.DatasetMetadata.from_feature_spec(
+        {
+            'x': tf.io.VarLenFeature(tf.int64),
+        }
+    )
+    with self.assertRaisesRegex(  # pylint: disable=g-error-prone-assert-raises
+        ValueError,
+        r'Annotated shape \[3, 42\] was expected to have rank 1',
+    ):
+      self.assertAnalyzeAndTransformResults(
+          input_data_dicts,
+          input_metadata,
+          preprocessing_fn,
+          force_tf_compat_v1=use_tf_compat_v1,
+      )
+
+
+if __name__ == '__main__':
+  tft_unit.main()
diff --git a/tensorflow_transform/experimental/annotators.py b/tensorflow_transform/experimental/annotators.py
@@ -13,12 +13,22 @@
 # limitations under the License.
 """Experimental APIs to get annotations."""
 
+from typing import Sequence
+
 import tensorflow as tf
 from tensorflow_transform import annotators
+from tensorflow_transform import schema_inference
 
 from tensorflow.python.framework import ops  # pylint: disable=g-direct-tensorflow-import
 
 
+__all__ = [
+    'get_vocabulary_size_by_name',
+    'annotate_sparse_output_shape',
+    'annotate_true_sparse_output',
+]
+
+
 def get_vocabulary_size_by_name(vocab_filename: str) -> tf.Tensor:
   # pyformat: disable
   """Gets the size of a vocabulary created using `tft.vocabulary`.
@@ -75,3 +85,31 @@ def get_vocabulary_size_by_name(vocab_filename: str) -> tf.Tensor:
         '`vocab_filename` argument passed to it.')
 
   return result
+
+
+def annotate_sparse_output_shape(tensor: tf.SparseTensor, shape: Sequence[int]):
+  """Annotates a sparse output to have a given dense_shape.
+
+  Args:
+    tensor: A `SparseTensor` to be annotated.
+    shape: A dense_shape to annotate `tensor` with. Note that this shape does
+      not include batch_size.
+  """
+  if len(shape) != tensor.shape.rank - 1:
+    raise ValueError(
+        f'Annotated shape {shape} was expected to have rank'
+        f' {tensor.shape.rank - 1}'
+    )
+  if not all(a is None or a <= b for a, b in zip(tensor.shape[1:], shape)):
+    raise ValueError(f'Shape {shape} cannot contain annotated tensor {tensor}')
+  # There's currently no way to override SparseTensor.dense_shape directly,
+  # unless composing and returning a new SparseTensor.
+  tensor._dense_shape = tf.convert_to_tensor(  # pylint: disable=protected-access
+      [tensor.dense_shape[0]] + list(shape), dtype=tf.int64
+  )
+  schema_inference.annotate_sparse_output_shape(tensor, shape)
+
+
+def annotate_true_sparse_output(tensor: tf.SparseTensor):
+  """Annotates a sparse output to be truly sparse and not varlen."""
+  schema_inference.annotate_true_sparse_output(tensor)
diff --git a/tensorflow_transform/schema_inference.py b/tensorflow_transform/schema_inference.py