Fixes a tfdv bug caused by slicing on a feature missing from a RecordBatch.

tfx-copybara · tfx-copybara · commit 18a0d9095cf4 · 2021-09-16T12:42:23.000-07:00
PiperOrigin-RevId: 397144053
diff --git a/RELEASE.md b/RELEASE.md
@@ -12,6 +12,8 @@
     large numbers of examples.
 *   Depends on
     `tensorflow>=1.15.2,!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,<3`.
+*   Fixed a bug wherein slicing on a feature missing from some batches could
+    produce slice keys derived from a different feature.
 
 ## Known Issues
 
diff --git a/tensorflow_data_validation/utils/slicing_util.py b/tensorflow_data_validation/utils/slicing_util.py
@@ -123,8 +123,12 @@ def feature_value_slicer(record_batch: pa.RecordBatch) -> Iterable[
     """
     per_feature_parent_indices = []
     for feature_name, values in six.iteritems(features):
-      feature_array = record_batch.column(
-          record_batch.schema.get_field_index(feature_name))
+      idx = record_batch.schema.get_field_index(feature_name)
+      # get_field_index returns -1 when the feature is absent from this record
+      # batch's schema; drop the feature rather than index a different column.
+      if idx < 0:
+        continue
+      feature_array = record_batch.column(idx)
       flattened, value_parent_indices = arrow_util.flatten_nested(
           feature_array, True)
       non_missing_values = np.asarray(flattened)
@@ -138,7 +142,10 @@ def feature_value_slicer(record_batch: pa.RecordBatch) -> Iterable[
       if values is not None:
         df = df.loc[df[feature_name].isin(values)]
       per_feature_parent_indices.append(df)
-
+    # If there are no features to slice on, yield no output.
+    # TODO(b/200081813): Produce output with an appropriate placeholder key.
+    if not per_feature_parent_indices:
+      return
     # Join dataframes based on parent indices.
     # Note that we want the parent indices per slice key to be sorted in the
     # merged dataframe. The individual dataframes have the parent indices in
diff --git a/tensorflow_data_validation/utils/slicing_util_test.py b/tensorflow_data_validation/utils/slicing_util_test.py
@@ -31,6 +31,7 @@ class SlicingUtilTest(absltest.TestCase):
   def _check_results(self, got, expected):
     got_dict = {g[0]: g[1] for g in got}
     expected_dict = {e[0]: e[1] for e in expected}
+
     self.assertCountEqual(got_dict.keys(), expected_dict.keys())
     for k, got_record_batch in got_dict.items():
       expected_record_batch = expected_dict[k]
@@ -80,6 +81,25 @@ def test_get_feature_value_slicer(self):
         slicing_util.get_feature_value_slicer(features)(input_record_batch),
         expected_result)
 
+  def test_get_feature_value_slicer_one_feature_not_in_batch(self):
+    features = {'not_an_actual_feature': None, 'a': None}
+    input_record_batch = pa.RecordBatch.from_arrays([
+        pa.array([[1], [2, 1]]),
+        pa.array([['dog'], ['cat']]),
+    ], ['a', 'b'])
+    expected_result = [
+        (u'a_1',
+         pa.RecordBatch.from_arrays(
+             [pa.array([[1], [2, 1]]),
+              pa.array([['dog'], ['cat']])], ['a', 'b'])),
+        (u'a_2',
+         pa.RecordBatch.from_arrays(
+             [pa.array([[2, 1]]), pa.array([['cat']])], ['a', 'b'])),
+    ]
+    self._check_results(
+        slicing_util.get_feature_value_slicer(features)(input_record_batch),
+        expected_result)
+
   def test_get_feature_value_slicer_single_feature(self):
     features = {'a': [2]}
     input_record_batch = pa.RecordBatch.from_arrays([
@@ -118,6 +138,18 @@ def test_get_feature_value_slicer_feature_not_in_record_batch(self):
         slicing_util.get_feature_value_slicer(features)(input_record_batch),
         expected_result)
 
+  def test_get_feature_value_slicer_feature_not_in_record_batch_all_values(
+      self):
+    features = {'c': None}
+    input_record_batch = pa.RecordBatch.from_arrays([
+        pa.array([[1], [2, 1]]),
+        pa.array([['dog'], ['cat']]),
+    ], ['a', 'b'])
+    expected_result = []
+    self._check_results(
+        slicing_util.get_feature_value_slicer(features)(input_record_batch),
+        expected_result)
+
   def test_get_feature_value_slicer_bytes_feature_valid_utf8(self):
     features = {'b': None}
     input_record_batch = pa.RecordBatch.from_arrays([