Commit 0d03e77

tf-transform-team authored and tfx-copybara committed
Extended _min_and_max_per_key to support element-wise reduction (reduce_instance_dims=False).
PiperOrigin-RevId: 454894071
1 parent 74789ff commit 0d03e77
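
For orientation, a minimal sketch of the usage this commit enables, assuming the private analyzers._min_and_max_per_key API exercised in the test diff below (composite tensor inputs still raise NotImplementedError in the element-wise case):

# A sketch only; mirrors the call pattern in impl_test.py below.
from tensorflow_transform import analyzers

def analyzer_fn(inputs):
  # With reduce_instance_dims=False, min and max are computed per key and
  # per element position, so the outputs keep the instance shape of `x`.
  key_vocab, min_x, max_x = analyzers._min_and_max_per_key(
      x=inputs['x'],               # dense tf.Tensor, e.g. shape [batch, 2]
      key=inputs['key'],           # one string key per example
      reduce_instance_dims=False)
  return {'key_vocab': key_vocab, 'min_x_value': min_x, 'max_x_value': max_x}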

File tree: 2 files changed, +166 −7 lines

tensorflow_transform/analyzers.py

Lines changed: 15 additions & 7 deletions
@@ -348,6 +348,12 @@ def _get_output_shape_from_input(x):
   return (None,)


+def _get_elementwise_per_key_output_shape(
+    x: tf.Tensor, key: Optional[tf.Tensor]) -> Optional[Tuple[int]]:
+  shape = x.get_shape() if key is None else x.get_shape()[1:]
+  return tuple(shape) if shape.is_fully_defined() else None
+
+
 # TODO(b/112414577): Go back to accepting only a single input.
 # Currently we accept multiple inputs so that we can implement min and max
 # with a single combiner. Once this is done, add a return pytype as well.
@@ -401,8 +407,7 @@ def _numeric_combine(inputs: List[tf.Tensor],
   else:
     # Reducing over batch dimensions.
     output_shapes = [
-        (tuple(x.get_shape()) if x.get_shape().is_fully_defined() else None)
-        for x in inputs
+        _get_elementwise_per_key_output_shape(x, key) for x in inputs
     ]
   combiner = NumPyCombiner(fn, default_accumulator_value,
                            [dtype.as_numpy_dtype for dtype in output_dtypes],
@@ -414,8 +419,8 @@ def _numeric_combine(inputs: List[tf.Tensor],
     return _apply_cacheable_combiner_per_key(combiner, key, *inputs)

   return _apply_cacheable_combiner_per_key_large(
-      combiner, _maybe_get_per_key_vocab_filename(key_vocabulary_filename),
-      key, *inputs)
+      combiner, _maybe_get_per_key_vocab_filename(key_vocabulary_filename), key,
+      *inputs)


 @common.log_api_use(common.ANALYZER_COLLECTION)
@@ -565,8 +570,10 @@ def _min_and_max_per_key(
   if key is None:
     raise ValueError('A key is required for _min_and_max_per_key')

-  if not reduce_instance_dims:
-    raise NotImplementedError('Per-key elementwise reduction not supported')
+  if not reduce_instance_dims and isinstance(
+      x, (tf.SparseTensor, tf.RaggedTensor)):
+    raise NotImplementedError(
+        'Per-key elementwise reduction of Composite Tensors not supported ')

   with tf.compat.v1.name_scope(name, 'min_and_max_per_key'):
     output_dtype = x.dtype
@@ -582,7 +589,8 @@ def _min_and_max_per_key(
                       -output_dtype.max)

     key_vocab, x_batch_minus_min, x_batch_max = (
-        tf_utils.reduce_batch_minus_min_and_max_per_key(x, key))
+        tf_utils.reduce_batch_minus_min_and_max_per_key(x, key,
+                                                        reduce_instance_dims))

     key_values = _numeric_combine(  # pylint: disable=unbalanced-tuple-unpacking
         inputs=[x_batch_minus_min, x_batch_max],
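
For reference, a standalone sketch of what the new _get_elementwise_per_key_output_shape helper computes; the tensor shapes here are illustrative assumptions, not taken from the commit:

import tensorflow as tf

x = tf.zeros([8, 2, 3])        # a batch of 8 examples, each of shape [2, 3]
key = tf.constant(['a'] * 8)   # one key per example

# With a key present, the batch dimension is dropped, so each per-key output
# has the instance shape [2, 3]; with key=None the full shape would be kept.
shape = x.get_shape() if key is None else x.get_shape()[1:]
print(tuple(shape) if shape.is_fully_defined() else None)  # (2, 3)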

tensorflow_transform/beam/impl_test.py

Lines changed: 151 additions & 0 deletions
@@ -1919,6 +1919,157 @@ def analyzer_fn(inputs):
         expected_outputs,
         desired_batch_size=10)

+  @tft_unit.named_parameters(
+      dict(
+          testcase_name='_dense_2d',
+          input_data=[{
+              'x': [4, 8],
+              'key': 'a'
+          }, {
+              'x': [1, 5],
+              'key': 'a'
+          }, {
+              'x': [5, 9],
+              'key': 'a'
+          }, {
+              'x': [2, 6],
+              'key': 'a'
+          }, {
+              'x': [-2, 0],
+              'key': 'b'
+          }, {
+              'x': [0, 2],
+              'key': 'b'
+          }, {
+              'x': [2, 4],
+              'key': 'b'
+          }],
+          input_metadata=tft.DatasetMetadata.from_feature_spec({
+              'x': tf.io.FixedLenFeature([2], tf.float32),
+              'key': tf.io.FixedLenFeature([], tf.string),
+          }),
+          reduce_instance_dims=True,
+          expected_outputs={
+              'key_vocab': np.array([b'a', b'b'], np.object),
+              'min_x_value': np.array([1, -2], np.float32),
+              'max_x_value': np.array([9, 4], np.float32),
+          }),
+      dict(
+          testcase_name='_dense_2d_elementwise',
+          input_data=[{
+              'x': [4, 8],
+              'key': 'a'
+          }, {
+              'x': [1, 5],
+              'key': 'a'
+          }, {
+              'x': [5, 9],
+              'key': 'a'
+          }, {
+              'x': [2, 6],
+              'key': 'a'
+          }, {
+              'x': [-2, 0],
+              'key': 'b'
+          }, {
+              'x': [0, 2],
+              'key': 'b'
+          }, {
+              'x': [2, 4],
+              'key': 'b'
+          }],
+          input_metadata=tft.DatasetMetadata.from_feature_spec({
+              'x': tf.io.FixedLenFeature([2], tf.float32),
+              'key': tf.io.FixedLenFeature([], tf.string),
+          }),
+          reduce_instance_dims=False,
+          expected_outputs={
+              'key_vocab': np.array([b'a', b'b'], np.object),
+              'min_x_value': np.array([[1, 5], [-2, 0]], np.float32),
+              'max_x_value': np.array([[5, 9], [2, 4]], np.float32),
+          }),
+      dict(
+          testcase_name='_dense_3d',
+          input_data=[
+              {
+                  'x': [[1, 5], [1, 1]],
+                  'key': 'a'
+              },
+              {
+                  'x': [[5, 1], [5, 5]],
+                  'key': 'a'
+              },
+              {
+                  'x': [[2, 2], [2, 5]],
+                  'key': 'a'
+              },
+              {
+                  'x': [[3, -3], [3, 3]],
+                  'key': 'b'
+              },
+          ],
+          input_metadata=tft.DatasetMetadata.from_feature_spec({
+              'x': tf.io.FixedLenFeature([2, 2], tf.float32),
+              'key': tf.io.FixedLenFeature([], tf.string),
+          }),
+          reduce_instance_dims=True,
+          expected_outputs={
+              'key_vocab': np.array([b'a', b'b'], np.object),
+              'min_x_value': np.array([1, -3], np.float32),
+              'max_x_value': np.array([5, 3], np.float32),
+          }),
+      dict(
+          testcase_name='_dense_3d_elementwise',
+          input_data=[
+              {
+                  'x': [[1, 5], [1, 1]],
+                  'key': 'a'
+              },
+              {
+                  'x': [[5, 1], [5, 5]],
+                  'key': 'a'
+              },
+              {
+                  'x': [[2, 2], [2, 5]],
+                  'key': 'a'
+              },
+              {
+                  'x': [[3, -3], [3, 3]],
+                  'key': 'b'
+              },
+          ],
+          input_metadata=tft.DatasetMetadata.from_feature_spec({
+              'x': tf.io.FixedLenFeature([2, 2], tf.float32),
+              'key': tf.io.FixedLenFeature([], tf.string),
+          }),
+          reduce_instance_dims=False,
+          expected_outputs={
+              'key_vocab':
+                  np.array([b'a', b'b'], np.object),
+              'min_x_value':
+                  np.array([[[1, 1], [1, 1]], [[3, -3], [3, 3]]], np.float32),
+              'max_x_value':
+                  np.array([[[5, 5], [5, 5]], [[3, -3], [3, 3]]], np.float32),
+          }),
+  )
+  def testMinAndMaxPerKey(self, input_data, input_metadata,
+                          reduce_instance_dims, expected_outputs):
+    self._SkipIfOutputRecordBatches()
+
+    def analyzer_fn(inputs):
+      key_vocab, min_x_value, max_x_value = analyzers._min_and_max_per_key(
+          x=inputs['x'],
+          key=inputs['key'],
+          reduce_instance_dims=reduce_instance_dims)
+      return {
+          'key_vocab': key_vocab,
+          'min_x_value': min_x_value,
+          'max_x_value': max_x_value,
+      }
+
+    self.assertAnalyzerOutputs(input_data, input_metadata, analyzer_fn,
+                               expected_outputs)
+
   @tft_unit.parameters((True,), (False,))
   def testPerKeyWithOOVKeys(self, use_vocabulary):
     def preprocessing_fn(inputs):
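
As a sanity check on the semantics the '_dense_2d_elementwise' case asserts, the same per-key, element-wise reduction can be reproduced in plain NumPy (a sketch, not part of the commit):

import numpy as np

# Batch and keys from the '_dense_2d_elementwise' test case above.
xs = np.array([[4, 8], [1, 5], [5, 9], [2, 6], [-2, 0], [0, 2], [2, 4]],
              dtype=np.float32)
keys = np.array(['a', 'a', 'a', 'a', 'b', 'b', 'b'])

for k in ('a', 'b'):
  rows = xs[keys == k]
  # Reduce over the batch axis only, keeping each element position separate.
  print(k, rows.min(axis=0), rows.max(axis=0))
# a [1. 5.] [5. 9.]   -> matches min_x_value[0] / max_x_value[0]
# b [-2. 0.] [2. 4.]  -> matches min_x_value[1] / max_x_value[1]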
