Make annotate_sparse_output_shape accept the annotated shape as a tensor (in addition to a sequence of ints), and simplify shape storage in graph collections by storing the shape tensors directly.

zoyahav · tfx-copybara · commit 96f96e6b0e0e · 2023-02-20T04:20:45.000-08:00
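This change also updates the census example so that preprocessing outputs one-hot encoded features as sparse tensors, avoiding large, mostly-empty dense tensors (especially when transformed data is saved to disk); the estimator input functions convert them back to dense.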

PiperOrigin-RevId: 510962121
diff --git a/examples/census_example.py b/examples/census_example.py
@@ -26,6 +26,14 @@
 # Functions for training
 
 
+def _make_inputs_dense(transformed_features):
+  return {
+      k: tf.sparse.to_dense(v) if isinstance(v, tf.SparseTensor) else v
+      for k, v in transformed_features.items()
+  }
+# pylint: disable=g-deprecated-tf-checker
+
+
 def _make_training_input_fn(tf_transform_output, transformed_examples,
                             batch_size):
   """Creates an input function reading from transformed data.
@@ -47,8 +55,9 @@ def input_fn():
         reader=tf.data.TFRecordDataset,
         shuffle=True)
 
-    transformed_features = tf.compat.v1.data.make_one_shot_iterator(
-        dataset).get_next()
+    transformed_features = _make_inputs_dense(
+        tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
+    )
 
     # Extract features and label from the transformed tensors.
     # TODO(b/30367437): make transformed_labels a dict.
@@ -86,8 +95,9 @@ def serving_input_fn():
     # Apply the transform function that was used to generate the materialized
     # data.
     raw_features = serving_input_receiver.features
-    transformed_features = tf_transform_output.transform_raw_features(
-        raw_features)
+    transformed_features = _make_inputs_dense(
+        tf_transform_output.transform_raw_features(raw_features)
+    )
 
     return tf_estimator.export.ServingInputReceiver(
         transformed_features, serving_input_receiver.receiver_tensors)
@@ -106,8 +116,13 @@ def get_feature_columns(tf_transform_output):
   """
   feature_spec = tf_transform_output.transformed_feature_spec()
   # Wrap scalars as real valued columns.
+  def get_shape(spec):
+    if isinstance(spec, tf.io.SparseFeature):
+      return spec.size
+    return spec.shape
+
   return [
-      tf.feature_column.numeric_column(key, shape=feature_spec[key].shape)
+      tf.feature_column.numeric_column(key, shape=get_shape(feature_spec[key]))
       for key in (common.NUMERIC_FEATURE_KEYS + common.CATEGORICAL_FEATURE_KEYS)
   ]
 
diff --git a/examples/census_example_common.py b/examples/census_example_common.py
@@ -140,11 +140,16 @@ def preprocessing_fn(inputs):
       one_hot_encoded = tf.one_hot(
           integerized,
           depth=tf.cast(depth, tf.int32),
-          on_value=1.0,
-          off_value=0.0)
-      # This output is now one-hot encoded. If saving transformed data to disk,
-      # this can incur significant memory cost.
-      outputs[key] = tf.reshape(one_hot_encoded, [-1, depth])
+          on_value=1,
+          off_value=0,
+          dtype=tf.int64)
+      # Saving one-hot encoded outputs as sparse in order to avoid large dense
+      # (mostly empty) tensors. This is especially important when saving
+      # transformed data to disk.
+      outputs[key] = tf.sparse.from_dense(
+          tf.reshape(one_hot_encoded, [-1, depth])
+      )
+      tft.experimental.annotate_sparse_output_shape(outputs[key], depth)
 
     # For the label column we provide the mapping from string to index.
     table_keys = ['>50K', '<=50K']
diff --git a/examples/census_example_v2.py b/examples/census_example_v2.py
@@ -187,16 +187,28 @@ def train_and_evaluate(raw_train_eval_data_path_pattern,
   feature_spec.pop(common.LABEL_KEY)
 
   inputs = {}
+  sparse_inputs = {}
+  dense_inputs = {}
   for key, spec in feature_spec.items():
     if isinstance(spec, tf.io.FixedLenFeature):
       # TODO(b/208879020): Move into schema such that spec.shape is [1] and not
       # [] for scalars.
       inputs[key] = tf.keras.layers.Input(
           shape=spec.shape or [1], name=key, dtype=spec.dtype)
+      dense_inputs[key] = inputs[key]
+    elif isinstance(spec, tf.io.SparseFeature):
+      inputs[key] = tf.keras.layers.Input(
+          shape=spec.size, name=key, dtype=spec.dtype, sparse=True
+      )
+      sparse_inputs[key] = inputs[key]
     else:
       raise ValueError('Spec type is not supported: ', key, spec)
 
-  stacked_inputs = tf.concat(tf.nest.flatten(inputs), axis=1)
+  outputs = [
+      tf.keras.layers.Dense(10, activation='relu')(x)
+      for x in tf.nest.flatten(sparse_inputs)
+  ]
+  stacked_inputs = tf.concat(tf.nest.flatten(dense_inputs) + outputs, axis=1)
   output = tf.keras.layers.Dense(100, activation='relu')(stacked_inputs)
   output = tf.keras.layers.Dense(70, activation='relu')(output)
   output = tf.keras.layers.Dense(50, activation='relu')(output)
diff --git a/tensorflow_transform/beam/annotators_test.py b/tensorflow_transform/beam/annotators_test.py
@@ -36,7 +36,7 @@ def preprocessing_fn(inputs):
       outputs = inputs.copy()
       x = tf.sparse.expand_dims(inputs['x'], -1)
       outputs['x'] = x
-      tft.experimental.annotate_sparse_output_shape(x, [1, 1])
+      tft.experimental.annotate_sparse_output_shape(x, tf.constant([1, 1]))
       tft.experimental.annotate_sparse_output_shape(outputs['y'], [17])
       tft.experimental.annotate_true_sparse_output(outputs['z'])
       return outputs
diff --git a/tensorflow_transform/experimental/annotators.py b/tensorflow_transform/experimental/annotators.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """Experimental APIs to get annotations."""
 
-from typing import Sequence
+from typing import Sequence, Union
 
 import tensorflow as tf
 from tensorflow_transform import annotators
@@ -87,26 +87,37 @@ def get_vocabulary_size_by_name(vocab_filename: str) -> tf.Tensor:
   return result
 
 
-def annotate_sparse_output_shape(tensor: tf.SparseTensor, shape: Sequence[int]):
+def annotate_sparse_output_shape(
+    tensor: tf.SparseTensor, shape: Union[Sequence[int], tf.Tensor]):
   """Annotates a sparse output to have a given dense_shape.
 
   Args:
     tensor: An `SparseTensor` to be annotated.
     shape: A dense_shape to annotate `tensor` with. Note that this shape does
       not include batch_size.
   """
-  if len(shape) != tensor.shape.rank - 1:
+  if not isinstance(shape, tf.Tensor):
+    if (tensor.shape.rank > 1 and tensor.shape.rank - 1 != len(shape)) or (
+        tensor.shape.rank == 1 and len(shape) != 1):
+      raise ValueError(
+          f'Annotated shape {shape} was expected to have rank'
+          f' {tensor.shape.rank - 1}')
+    if not all(a is None or a <= b for a, b in zip(tensor.shape[1:], shape)):
+      raise ValueError(
+          f'Shape {shape} cannot contain annotated tensor {tensor}')
+    shape = tf.convert_to_tensor(shape, dtype=tf.int64)
+  elif shape.shape.rank > 1 or (
+      shape.shape.rank == 1 and shape.shape[0] != tensor.shape.rank - 1):
     raise ValueError(
-        f'Annotated shape {shape} was expected to have rank'
-        f' {tensor.shape.rank - 1}'
-    )
-  if not all(a is None or a <= b for a, b in zip(tensor.shape[1:], shape)):
-    raise ValueError(f'Shape {shape} cannot contain annotated tensor {tensor}')
+        f'Annotation shape has rank {shape.shape.rank} but expected to have'
+        f' rank {tensor.shape.rank - 1}')
+  if shape.shape.rank < 1:
+    shape = tf.expand_dims(shape, -1)
   # There's currently no way to override SparseTensor.dense_shape directly,
   # unless composing and returning a new SparseTensor.
-  tensor._dense_shape = tf.convert_to_tensor(  # pylint: disable=protected-access
-      [tensor.dense_shape[0]] + list(shape), dtype=tf.int64
-  )
+  tensor._dense_shape = tf.concat(  # pylint: disable=protected-access
+      [tf.expand_dims(tensor.dense_shape[0], -1), tf.cast(shape, tf.int64)],
+      axis=0)
   schema_inference.annotate_sparse_output_shape(tensor, shape)
 
 
diff --git a/tensorflow_transform/schema_inference.py b/tensorflow_transform/schema_inference.py
@@ -20,9 +20,8 @@
 """
 
 import collections
-import functools
 import itertools
-from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Union
 from absl import logging
 
 import tensorflow as tf
@@ -188,7 +187,7 @@ def infer_feature_schema(
     tensor_ranges = session.run(tensor_ranges)
     tensor_annotations, global_annotations = _get_schema_annotations(
         graph, session)
-  sparse_output_annotations = _get_sparse_output_annotations_v1(graph)
+  sparse_output_annotations = _get_sparse_output_annotations_v1(graph, session)
   modified_sparse_output_annotations = {}
   modified_tensor_ranges = {}
   feature_annotations = {}
@@ -790,24 +789,14 @@ def metadata_fn():
   return module.metadata_fn
 
 _ANNOTATED_SPARSE_SHAPE_TENSORS = 'annotated_sparse_shape_tensors'
-_ANNOTATED_SPARSE_SHAPE_RANKS = 'annotated_sparse_shape_ranks'
-_ANNOTATED_SPARSE_SHAPE_DIMENSIONS = 'annotated_sparse_shape_dimensions'
+_ANNOTATED_SPARSE_SHAPES = 'annotated_sparse_shape_dimensions'
 _ANNOTATED_TRUELY_SPARSE_TENSORS = 'annotated_truely_sparse_tensors'
 
 
-def annotate_sparse_output_shape(tensor: tf.SparseTensor, shape: Sequence[int]):
+def annotate_sparse_output_shape(tensor: tf.SparseTensor, shape: tf.Tensor):
   """Annotates a sparse output with a given shape."""
-  if tensor.shape.rank - 1 != len(shape):
-    raise ValueError(
-        f'Output {tensor} was annotated with an incompatible shape: {shape}'
-    )
   tf.compat.v1.add_to_collection(_ANNOTATED_SPARSE_SHAPE_TENSORS, tensor.values)
-  # We store rank and dimensions to separate collections since TF collections
-  # don't allow storing lists. This can be simplified if we switch away from
-  # collections.
-  tf.compat.v1.add_to_collection(_ANNOTATED_SPARSE_SHAPE_RANKS, len(shape))
-  for dim in shape:
-    tf.compat.v1.add_to_collection(_ANNOTATED_SPARSE_SHAPE_DIMENSIONS, dim)
+  tf.compat.v1.add_to_collection(_ANNOTATED_SPARSE_SHAPES, shape)
 
 
 def annotate_true_sparse_output(tensor: tf.SparseTensor):
@@ -818,26 +807,16 @@ def annotate_true_sparse_output(tensor: tf.SparseTensor):
 
 
 def _extract_true_sparse_annotations(
-    graph: tf.compat.v1.Graph,
-) -> List[tf.Tensor]:
+    graph: tf.compat.v1.Graph) -> List[tf.Tensor]:
   """Extracts true sparse annotations from the graph."""
   return graph.get_collection(_ANNOTATED_TRUELY_SPARSE_TENSORS)
 
 
 def _extract_sparse_output_annotations(
-    graph: tf.compat.v1.Graph,
-) -> List[Tuple[tf.Tensor, List[tf.Tensor]]]:
+    graph: tf.compat.v1.Graph) -> List[Tuple[tf.Tensor, List[tf.Tensor]]]:
   """Extracts sparse output annotations from the graph."""
   tensors = graph.get_collection(_ANNOTATED_SPARSE_SHAPE_TENSORS)
-  ranks = graph.get_collection(_ANNOTATED_SPARSE_SHAPE_RANKS)
-  assert len(tensors) == len(ranks), f'{tensors} != {ranks}'
-  shape_flattened = graph.get_collection(_ANNOTATED_SPARSE_SHAPE_DIMENSIONS)
-
-  # Splitting dimensions per annotated tensor.
-  splits = functools.reduce(lambda lst, x: lst + [lst[-1] + x], ranks, [0])
-  # Composing the annotated shape per tensor.
-  shapes = tuple(shape_flattened[s:e] for s, e in zip(splits[:-1], splits[1:]))
-
+  shapes = graph.get_collection(_ANNOTATED_SPARSE_SHAPES)
   assert len(tensors) == len(shapes), f'{tensors} != {shapes}'
   return list(zip(tensors, shapes))
 
@@ -852,7 +831,7 @@ def _get_sparse_output_annotations(
   return list(
       itertools.chain(
           (
-              (a, [''])
+              (a, tf.constant(['']))
               for a in _extract_true_sparse_annotations(graph)
               if a.ref() not in annotated_refs
           ),
@@ -862,12 +841,15 @@ def _get_sparse_output_annotations(
 
 
 def _get_sparse_output_annotations_v1(
-    graph: tf.compat.v1.Graph,
+    graph: tf.compat.v1.Graph, session: Optional[tf.compat.v1.Session]
 ) -> Dict[Any, List[Union[str, tf.Tensor]]]:
-  return {
-      tf_utils.hashable_tensor_or_op(kv[0]): kv[1]
-      for kv in _get_sparse_output_annotations(graph)
-  }
+  if not session:
+    return {}
+  else:
+    return {
+        tf_utils.hashable_tensor_or_op(kv[0]): session.run(kv[1])
+        for kv in _get_sparse_output_annotations(graph)
+    }
 
 
 def _get_sparse_output_annotations_v2(