Skip to content

Commit 1e20b90

Browse files
zoyahavtfx-copybara
authored and committed
Accounting for the empty vocabulary dummy token in VocabularyCount. This fixes an issue where get_vocabulary_size_by_name returns 0 for an empty vocabulary, while the actual vocabulary size in this case is 1.
PiperOrigin-RevId: 523356276
1 parent 135d77d commit 1e20b90

File tree

3 files changed

+50
-1
lines changed

3 files changed

+50
-1
lines changed

RELEASE.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
* Depends on `numpy~=1.22.0`.
2020
* Depends on `tensorflow>=2.12.0,<2.13`.
2121
* Depends on `protobuf>=3.20.3,<5`.
22+
* Modifies `get_vocabulary_size_by_name` to return a minimum of 1.
2223

2324
## Breaking Changes
2425

tensorflow_transform/beam/analyzer_impls.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,12 +302,18 @@ class _VocabularyCountImpl(beam.PTransform):
302302
def __init__(self, operation, extra_args):
303303
super().__init__()
304304

305+
def _format_count(self, count):
306+
# Count should be at least one because empty vocabularies get populated with
307+
# a single dummy value when written.
308+
# TODO(b/62272023) remove this workaround if/when fixed on tensorflow.
309+
return np.int64(np.maximum(count, 1))
310+
305311
def expand(self, inputs):
306312
pcoll, = inputs
307313

308314
return (pcoll
309315
| 'TotalVocabSize' >> beam.combiners.Count.Globally()
310-
| 'ToInt64' >> beam.Map(np.int64))
316+
| 'FormatCount' >> beam.Map(self._format_count))
311317

312318

313319
@common.register_ptransform(analyzer_nodes.VocabularyMerge)

tensorflow_transform/beam/annotators_test.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,48 @@ def preprocessing_fn(inputs):
216216
force_tf_compat_v1=use_tf_compat_v1,
217217
)
218218

219+
@tft_unit.named_parameters(
220+
dict(
221+
testcase_name='sanity',
222+
values=['hello', 'world', 'world'],
223+
expected_size=2,
224+
),
225+
dict(
226+
testcase_name='single_token',
227+
values=['hello', 'hello', 'hello'],
228+
expected_size=1,
229+
),
230+
dict(
231+
testcase_name='empty',
232+
values=['', '', ''],
233+
expected_size=1,
234+
),
235+
)
236+
def test_get_vocabulary_size_by_name(self, values, expected_size):
237+
vocab_filename = 'vocab'
238+
239+
def preprocessing_fn(inputs):
240+
tft.vocabulary(inputs['s'], vocab_filename=vocab_filename)
241+
size = tf.zeros_like(
242+
inputs['s'], dtype=tf.int64
243+
) + tft.experimental.get_vocabulary_size_by_name(vocab_filename)
244+
return {'size': size}
245+
246+
input_data_dicts = [dict(s=v) for v in values]
247+
input_metadata = tft.DatasetMetadata.from_feature_spec({
248+
's': tf.io.FixedLenFeature([], tf.string),
249+
})
250+
expected_data = [{
251+
'size': expected_size,
252+
}] * len(values)
253+
self.assertAnalyzeAndTransformResults(
254+
input_data_dicts,
255+
input_metadata,
256+
preprocessing_fn,
257+
force_tf_compat_v1=False,
258+
expected_data=expected_data,
259+
)
260+
219261

220262
if __name__ == '__main__':
221263
tft_unit.main()

0 commit comments

Comments (0)