Commit d4c8b92

iindyk authored and tfx-copybara committed
Slice transformed data batches into smaller chunks if their size exceeds 200MB.
With TFXIO inputs, Transform does not control the input batch size; the output batch size follows from the input batch size plus the preprocessing_fn logic. Since a preprocessing_fn can increase the size of the data, aggressively batched inputs can produce very large output batches that cause trouble downstream.

PiperOrigin-RevId: 551903601
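
The change handles this by recursively bisecting any oversized output `pa.RecordBatch` by rows until every chunk fits under the limit; the full implementation is in the impl.py diff below. As a rough standalone sketch of the idea (simplified, with a hypothetical 1KB cap instead of the commit's 200MB constant), the behaviour looks like this:

    from typing import Iterable

    import pyarrow as pa

    _MAX_BYTES = 1 << 10  # Hypothetical 1KB cap for illustration; the commit uses 200MB.


    def slice_large_batch(batch: pa.RecordBatch) -> Iterable[pa.RecordBatch]:
      """Recursively halves a batch by rows until every chunk fits the cap."""
      if batch.nbytes > _MAX_BYTES and batch.num_rows >= 2:
        mid = batch.num_rows // 2
        yield from slice_large_batch(batch.slice(0, mid))  # First half of the rows.
        yield from slice_large_batch(batch.slice(mid))     # Remaining rows.
      else:
        yield batch


    batch = pa.RecordBatch.from_arrays(
        [pa.array(range(1024), pa.int64())], names=['x']
    )  # 1024 int64 values, about 8KB.
    chunks = list(slice_large_batch(batch))
    print(len(chunks), {c.num_rows for c in chunks})  # 8 chunks of 128 rows each.

Because the recursion halves by row count, chunk sizes land on the largest power-of-two split of the rows that fits under the cap.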
1 parent d59b84d commit d4c8b92

File tree: 3 files changed (+137, −15 lines)

RELEASE.md

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,8 @@
 
 * `approximate_vocabulary` now returns tokens with the same frequency in
   reverse lexicographical order (similarly to `tft.vocabulary`).
+* Transformed data batches are now sliced into smaller chunks if their size
+  exceeds 200MB.
 * Depends on `pyarrow>=10,<11`.
 * Depends on `apache-beam>=2.47,<3`.
 * Depends on `numpy>=1.22.0`.

tensorflow_transform/beam/impl.py

Lines changed: 49 additions & 6 deletions
@@ -126,6 +126,20 @@
     fn_api_runner.FnApiRunner: _FIXED_PARALLELISM_TF_CONFIG,
 }
 
+# Batches larger than this will be sliced into smaller chunks. This size limit
+# must be at least as strict as the following constraints:
+# 1. The number of elements in each individual array of the batch must be at
+#    most 2^31 - 1. Beam's `pa.RecordBatch` PCoder does not support larger
+#    sizes (even though the produced containers such as LargeListArray and
+#    LargeBinaryArray do).
+# 2. The serialized size of the batch must be less than 2GB. Beam's shuffle
+#    stage wraps serialized batches into a proto for materialization, and 2GB
+#    is the proto size limit.
+# We set a much stricter limit than the above to additionally improve handling
+# of the outputs by distributing the data over a larger number of (still
+# reasonably big) batches.
+_MAX_TRANSFORMED_BATCH_BYTES_SIZE = 200 << 10 << 10  # 200MB
+
 # TODO(b/68154497): pylint: disable=no-value-for-parameter
 
 
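To put the 200MB constant in context (illustrative arithmetic only; these names are not defined in impl.py): it sits roughly an order of magnitude below the 2GB proto limit, and even a batch of pure float32 data at that size holds about 52 million values, far under the 2^31 - 1 per-array element cap.

    max_batch_bytes = 200 << 10 << 10        # 209_715_200 bytes (200 MiB).
    proto_limit_bytes = 2 << 10 << 10 << 10  # 2 GiB serialized-proto limit.
    max_array_elements = 2**31 - 1           # Per-array element cap in the coder.

    print(proto_limit_bytes // max_batch_bytes)       # 10x headroom on serialized size.
    print(max_batch_bytes // 4)                       # ~52M float32 values per batch,
    print(max_batch_bytes // 4 < max_array_elements)  # True: well under the element cap.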
@@ -412,6 +426,34 @@ def _warn_about_tf_compat_v1():
       'Features such as tf.function may not work as intended.')
 
 
+def _maybe_slice_large_record_batch(
+    record_batch: pa.RecordBatch,
+) -> Iterable[pa.RecordBatch]:
+  """Slices large batches into smaller chunks."""
+  if record_batch.nbytes > _MAX_TRANSFORMED_BATCH_BYTES_SIZE:
+    if record_batch.num_rows < 2:
+      logging.warning(
+          'Transformed data row may be too large: %d bytes. '
+          'Consider reshaping outputs to distribute elements over a larger '
+          'number of rows to allow automatic slicing.',
+          record_batch.nbytes,
+      )
+      yield record_batch
+      return
+    # Note that slicing is a zero-copy operation, so the produced batches will
+    # still share memory with the original one up to the materialization
+    # boundary.
+    mid_point = record_batch.num_rows // 2
+    yield from _maybe_slice_large_record_batch(
+        record_batch.slice(offset=0, length=mid_point)
+    )
+    yield from _maybe_slice_large_record_batch(
+        record_batch.slice(offset=mid_point)
+    )
+  else:
+    yield record_batch
+
+
 def _convert_to_record_batch(
     batch_dict: Dict[str, Union[common_types.TensorValueType, pa.Array]],
     converter: tensor_to_arrow.TensorsToRecordBatchConverter,
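The zero-copy comment in `_maybe_slice_large_record_batch` can be checked directly with pyarrow: a sliced RecordBatch reports only the bytes it references but still points at the parent's buffers. A small illustration (not part of the commit):

    import pyarrow as pa

    batch = pa.RecordBatch.from_arrays(
        [pa.array(range(1_000_000), pa.int64())], names=['x']
    )
    half = batch.slice(0, batch.num_rows // 2)

    # The slice only accounts for the bytes it references...
    print(batch.nbytes, half.nbytes)  # 8000000 4000000
    # ...while its data buffer is the parent's buffer itself, i.e. no copy is made.
    print(half.column(0).buffers()[1].address == batch.column(0).buffers()[1].address)  # True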
@@ -420,8 +462,8 @@ def _convert_to_record_batch(
         TensorAdapterConfig, dataset_metadata.DatasetMetadata
     ],
     validate_varlen_sparse_values: bool = False,
-) -> Tuple[pa.RecordBatch, Dict[str, pa.Array]]:
-  """Convert batches of ndarrays to pyarrow.RecordBatch."""
+) -> Iterable[Tuple[pa.RecordBatch, Dict[str, pa.Array]]]:
+  """Converts a batch of ndarrays to pyarrow.RecordBatches."""
 
   # Making a copy of batch_dict because mutating PCollection elements is not
   # allowed.
@@ -466,9 +508,10 @@ def _convert_to_record_batch(
       arrow_columns.append(data)
     else:
       unary_passthrough_features[key] = data
-
-  return pa.RecordBatch.from_arrays(
-      arrow_columns, schema=arrow_schema), unary_passthrough_features
+  for record_batch in _maybe_slice_large_record_batch(
+      pa.RecordBatch.from_arrays(arrow_columns, schema=arrow_schema)
+  ):
+    yield record_batch, unary_passthrough_features
 
 
 def _transformed_batch_to_instance_dicts(
@@ -1545,7 +1588,7 @@ def expand(self, dataset_and_transform_fn):
         )
     )
 
-    output_data = output_batches | 'ConvertToRecordBatch' >> beam.Map(
+    output_data = output_batches | 'ConvertToRecordBatch' >> beam.FlatMap(
        _convert_to_record_batch,
        converter=beam.pvalue.AsSingleton(converter_pcol),
        passthrough_keys=Context.get_passthrough_keys(),
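
Switching from `beam.Map` to `beam.FlatMap` is what lets a single input batch fan out into several output elements now that `_convert_to_record_batch` is a generator. A toy illustration of the difference (not from the commit):

    import apache_beam as beam


    def _halve(values):
      # A generator: may yield more than one output element per input element.
      mid = len(values) // 2
      yield values[:mid]
      yield values[mid:]


    with beam.Pipeline() as pipeline:
      _ = (
          pipeline
          | beam.Create([[1, 2, 3, 4]])
          # FlatMap flattens the yielded chunks into separate PCollection
          # elements; Map would emit one generator object per input instead.
          | 'Split' >> beam.FlatMap(_halve)
          | beam.Map(print)  # Prints [1, 2] and [3, 4].
      )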

tensorflow_transform/beam/impl_output_record_batches_test.py

Lines changed: 86 additions & 9 deletions
@@ -23,7 +23,9 @@
 from tensorflow_transform.beam import impl_test
 from tensorflow_transform.beam import tft_unit
 from tensorflow_transform.tf_metadata import schema_utils
-from tfx_bsl.tfxio.tensor_adapter import TensorAdapterConfig
+from tfx_bsl.tfxio import tensor_adapter
+
+_LARGE_BATCH_SIZE = 1 << 10
 
 
 class BeamImplOutputRecordBatchesTest(impl_test.BeamImplTest):
@@ -91,9 +93,11 @@ def testConvertToRecordBatchPassthroughData(self):
         (passthrough_key4, batch_dict[passthrough_key4].type)
     ])
     # Note that we only need `input_metadata.arrow_schema`.
-    input_metadata = TensorAdapterConfig(arrow_schema, {})
-    record_batch, unary_features = impl._convert_to_record_batch(
-        batch_dict, converter, passthrough_keys, input_metadata)
+    input_metadata = tensor_adapter.TensorAdapterConfig(arrow_schema, {})
+    converted = list(impl._convert_to_record_batch(
+        batch_dict, converter, passthrough_keys, input_metadata))
+    self.assertLen(converted, 1)
+    record_batch, unary_features = converted[0]
     expected_record_batch = {
         'a': [[100], [1], [10]],
         passthrough_key1: [[1], None, [0]]
@@ -115,11 +119,84 @@ def testConvertToRecordBatchPassthroughData(self):
                                         pa.large_list(pa.int64()))
     input_metadata.arrow_schema = input_metadata.arrow_schema.append(
         pa.field(passthrough_key5, batch_dict[passthrough_key5].type))
-    with self.assertRaisesRegexp(
-        ValueError, 'Cannot pass-through data when '
-        'input and output batch sizes are different'):
-      _ = impl._convert_to_record_batch(batch_dict, converter, passthrough_keys,
-                                        input_metadata)
+    with self.assertRaisesRegex(
+        ValueError,
+        'Cannot pass-through data when '
+        'input and output batch sizes are different',
+    ):
+      _ = list(
+          impl._convert_to_record_batch(
+              batch_dict, converter, passthrough_keys, input_metadata
+          )
+      )
+
+  @tft_unit.named_parameters(
+      dict(
+          testcase_name='NoPassthroughData',
+          passthrough_data={},
+          expected_unary_features={},
+      ),
+      dict(
+          testcase_name='WithPassthroughData',
+          passthrough_data={
+              '__passthrough_with_batch_length__': pa.array(
+                  [[1]] * _LARGE_BATCH_SIZE, pa.large_list(pa.int64())
+              ),
+              '__passthrough_with_one_value__': pa.array(
+                  [None], pa.large_list(pa.float32())
+              ),
+          },
+          expected_unary_features={
+              '__passthrough_with_one_value__': pa.array(
+                  [None], pa.large_list(pa.float32())
+              ),
+          },
+      ),
+  )
+  def testConvertToLargeRecordBatch(
+      self, passthrough_data, expected_unary_features
+  ):
+    """Tests slicing of large transformed batches during conversion."""
+    # Any Beam test pipeline handling elements this large crashes the program
+    # with OOM (even with 28GB memory available), so we test the conversion
+    # pretty narrowly.
+
+    # 2^31 elements in total.
+    num_values = 1 << 21
+    batch_dict = {
+        'a': np.zeros([_LARGE_BATCH_SIZE, num_values], np.float32),
+        **passthrough_data,
+    }
+    schema = schema_utils.schema_from_feature_spec(
+        {'a': tf.io.FixedLenFeature([num_values], tf.float32)}
+    )
+    converter = impl_helper.make_tensor_to_arrow_converter(schema)
+    arrow_schema = pa.schema(
+        [
+            ('a', pa.large_list(pa.float32())),
+        ]
+        + [(key, value.type) for key, value in passthrough_data.items()]
+    )
+    input_metadata = tensor_adapter.TensorAdapterConfig(arrow_schema, {})
+    actual_num_rows = 0
+    actual_num_batches = 0
+    # Features are either going to be in the `record_batch` or in
+    # `unary_features`.
+    record_batch_features = set(batch_dict.keys()) - set(
+        expected_unary_features.keys()
+    )
+    for record_batch, unary_features in impl._convert_to_record_batch(
+        batch_dict, converter, set(passthrough_data.keys()), input_metadata
+    ):
+      self.assertEqual(set(record_batch.schema.names), record_batch_features)
+      self.assertEqual(unary_features, expected_unary_features)
+      self.assertLessEqual(
+          record_batch.nbytes, impl._MAX_TRANSFORMED_BATCH_BYTES_SIZE
+      )
+      actual_num_rows += record_batch.num_rows
+      actual_num_batches += 1
+    self.assertEqual(actual_num_rows, _LARGE_BATCH_SIZE)
+    self.assertGreater(actual_num_batches, 1)
 
 
 if __name__ == '__main__':
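
For a sense of scale in the test above (illustrative arithmetic, not part of the commit): the input is 1 << 10 rows of 1 << 21 float32 values, i.e. 2^31 values or 8GiB of feature data, so satisfying the 200MB bound forces many output chunks, which is what `assertGreater(actual_num_batches, 1)` checks.

    rows = 1 << 10                            # _LARGE_BATCH_SIZE
    values_per_row = 1 << 21                  # num_values
    total_bytes = rows * values_per_row * 4   # float32 -> 8 GiB of feature data.
    max_chunk = 200 << 10 << 10               # _MAX_TRANSFORMED_BATCH_BYTES_SIZE
    print(total_bytes // (1 << 30))           # 8 (GiB).
    print(-(-total_bytes // max_chunk))       # 41: minimum number of chunks by size alone.
    # The halving strategy overshoots that a bit: each row is 8MiB, and halving
    # 1024 rows stops at 16-row chunks (~128MiB), i.e. 64 output batches.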
