Skip to content

Commit 7875e9e

Browse files
zoyahavtfx-copybara
authored and committed
Enable store_frequency for compute_and_apply_vocabulary.
When file_format is 'text' then space characters in tokens are replaced in both the vocabulary and prior to lookup, as a workaround to allow TextFileInitializer to read the data properly. PiperOrigin-RevId: 523634481
1 parent 1e20b90 commit 7875e9e

File tree

5 files changed

+252
-133
lines changed

5 files changed

+252
-133
lines changed

RELEASE.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
* `DatasetKey.non_cacheable` added to allow for some datasets to not produce
1414
cache. This may be useful for gradual cache generation when operating on a
1515
large rolling range of datasets.
16+
* Vocabularies produced by `compute_and_apply_vocabulary` can now store
17+
frequencies. Controlled by the `store_frequency` parameter.
1618

1719
## Bug Fixes and Other Changes
1820

tensorflow_transform/beam/vocabulary_integration_test.py

Lines changed: 105 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -31,100 +31,86 @@
3131
dict(
3232
testcase_name='sparse',
3333
input_data=[
34-
{
35-
'val': ['hello'],
36-
'idx0': [0],
37-
'idx1': [0]
38-
},
39-
{
40-
'val': ['world'],
41-
'idx0': [1],
42-
'idx1': [1]
43-
},
44-
{
45-
'val': ['hello', 'goodbye'],
46-
'idx0': [0, 1],
47-
'idx1': [1, 2]
48-
},
34+
{'val': ['hello'], 'idx0': [0], 'idx1': [0]},
35+
{'val': ['world'], 'idx0': [1], 'idx1': [1]},
36+
{'val': ['hello', 'goodbye'], 'idx0': [0, 1], 'idx1': [1, 2]},
4937
{
5038
'val': ['hello', 'goodbye', ' '],
5139
'idx0': [0, 1, 1],
52-
'idx1': [0, 1, 2]
40+
'idx1': [0, 1, 2],
5341
},
5442
],
55-
input_metadata=tft.DatasetMetadata.from_feature_spec({
56-
'x': tf.io.SparseFeature(['idx0', 'idx1'], 'val', tf.string, [2, 3])
57-
}),
58-
expected_data=[{
59-
'index$sparse_indices_0': [0],
60-
'index$sparse_indices_1': [0],
61-
'index$sparse_values': [0],
62-
}, {
63-
'index$sparse_indices_0': [1],
64-
'index$sparse_indices_1': [1],
65-
'index$sparse_values': [2],
66-
}, {
67-
'index$sparse_indices_0': [0, 1],
68-
'index$sparse_indices_1': [1, 2],
69-
'index$sparse_values': [0, 1],
70-
}, {
71-
'index$sparse_indices_0': [0, 1, 1],
72-
'index$sparse_indices_1': [0, 1, 2],
73-
'index$sparse_values': [0, 1, 3],
74-
}],
75-
expected_vocab_file_contents={
76-
'my_vocab': [b'hello', b'goodbye', b'world', b' ']
77-
}),
78-
dict(
79-
testcase_name='ragged',
80-
input_data=[
43+
input_metadata=tft.DatasetMetadata.from_feature_spec(
44+
{
45+
'x': tf.io.SparseFeature(
46+
['idx0', 'idx1'], 'val', tf.string, [2, 3]
47+
)
48+
}
49+
),
50+
expected_data=[
8151
{
82-
'val': ['hello', ' '],
83-
'row_lengths': [1, 0, 1]
52+
'index$sparse_indices_0': [0],
53+
'index$sparse_indices_1': [0],
54+
'index$sparse_values': [0],
8455
},
8556
{
86-
'val': ['world'],
87-
'row_lengths': [0, 1]
57+
'index$sparse_indices_0': [1],
58+
'index$sparse_indices_1': [1],
59+
'index$sparse_values': [2],
8860
},
8961
{
90-
'val': ['hello', 'goodbye'],
91-
'row_lengths': [2, 0, 0]
62+
'index$sparse_indices_0': [0, 1],
63+
'index$sparse_indices_1': [1, 2],
64+
'index$sparse_values': [0, 1],
9265
},
9366
{
94-
'val': ['hello', 'goodbye', ' '],
95-
'row_lengths': [0, 2, 1]
67+
'index$sparse_indices_0': [0, 1, 1],
68+
'index$sparse_indices_1': [0, 1, 2],
69+
'index$sparse_values': [0, 1, 3],
9670
},
9771
],
98-
input_metadata=tft.DatasetMetadata.from_feature_spec({
99-
'x':
100-
tf.io.RaggedFeature(
72+
expected_vocab_contents={
73+
b'hello': 3,
74+
b'goodbye': 2,
75+
b'world': 1,
76+
b' ': 1,
77+
},
78+
),
79+
dict(
80+
testcase_name='ragged',
81+
input_data=[
82+
{'val': ['hello', ' '], 'row_lengths': [1, 0, 1]},
83+
{'val': ['world'], 'row_lengths': [0, 1]},
84+
{'val': ['hello', 'goodbye'], 'row_lengths': [2, 0, 0]},
85+
{'val': ['hello', 'goodbye', ' '], 'row_lengths': [0, 2, 1]},
86+
],
87+
input_metadata=tft.DatasetMetadata.from_feature_spec(
88+
{
89+
'x': tf.io.RaggedFeature(
10190
tf.string,
10291
value_key='val',
10392
partitions=[
10493
tf.io.RaggedFeature.RowLengths('row_lengths') # pytype: disable=attribute-error
105-
])
106-
}),
94+
],
95+
)
96+
}
97+
),
10798
expected_data=[
108-
{
109-
'index$ragged_values': [0, 2],
110-
'index$row_lengths_1': [1, 0, 1]
111-
},
112-
{
113-
'index$ragged_values': [3],
114-
'index$row_lengths_1': [0, 1]
115-
},
116-
{
117-
'index$ragged_values': [0, 1],
118-
'index$row_lengths_1': [2, 0, 0]
119-
},
99+
{'index$ragged_values': [0, 2], 'index$row_lengths_1': [1, 0, 1]},
100+
{'index$ragged_values': [3], 'index$row_lengths_1': [0, 1]},
101+
{'index$ragged_values': [0, 1], 'index$row_lengths_1': [2, 0, 0]},
120102
{
121103
'index$ragged_values': [0, 1, 2],
122-
'index$row_lengths_1': [0, 2, 1]
104+
'index$row_lengths_1': [0, 2, 1],
123105
},
124106
],
125-
expected_vocab_file_contents={
126-
'my_vocab': [b'hello', b'goodbye', b' ', b'world']
127-
}),
107+
expected_vocab_contents={
108+
b'hello': 3,
109+
b'goodbye': 2,
110+
b' ': 2,
111+
b'world': 1,
112+
},
113+
),
128114
]
129115

130116

@@ -733,7 +719,11 @@ def preprocessing_fn(inputs):
733719
'my_approximate_vocab': expected_vocab_file_contents
734720
})
735721

736-
def testComputeAndApplyApproximateVocabulary(self):
722+
@tft_unit.named_parameters([
723+
dict(testcase_name='no_frequency', store_frequency=False),
724+
dict(testcase_name='with_frequency', store_frequency=True),
725+
])
726+
def testComputeAndApplyApproximateVocabulary(self, store_frequency):
737727
input_data = [{'x': 'a'}] * 2 + [{'x': 'b'}] * 3
738728
input_metadata = tft.DatasetMetadata.from_feature_spec(
739729
{'x': tf.io.FixedLenFeature([], tf.string)})
@@ -743,7 +733,9 @@ def preprocessing_fn(inputs):
743733
inputs['x'],
744734
top_k=2,
745735
file_format=self._VocabFormat(),
746-
num_oov_buckets=1)
736+
store_frequency=store_frequency,
737+
num_oov_buckets=1,
738+
)
747739
return {'index': index}
748740

749741
expected_data = [{'index': 1}] * 2 + [{'index': 0}] * 3 + [{'index': 2}]
@@ -1355,19 +1347,49 @@ def preprocessing_fn(inputs):
13551347
expected_metadata,
13561348
expected_vocab_file_contents=expected_vocab_file_contents)
13571349

1358-
@tft_unit.named_parameters(*_COMPOSITE_COMPUTE_AND_APPLY_VOCABULARY_TEST_CASES
1359-
)
1360-
def testCompositeComputeAndApplyVocabulary(self, input_data, input_metadata,
1361-
expected_data,
1362-
expected_vocab_file_contents):
1363-
1350+
@tft_unit.named_parameters(
1351+
*tft_unit.cross_named_parameters(
1352+
_COMPOSITE_COMPUTE_AND_APPLY_VOCABULARY_TEST_CASES,
1353+
[
1354+
dict(testcase_name='no_frequency', store_frequency=False),
1355+
dict(testcase_name='with_frequency', store_frequency=True),
1356+
],
1357+
)
1358+
)
1359+
def testCompositeComputeAndApplyVocabulary(
1360+
self,
1361+
input_data,
1362+
input_metadata,
1363+
expected_data,
1364+
expected_vocab_contents,
1365+
store_frequency,
1366+
):
13641367
def preprocessing_fn(inputs):
13651368
index = tft.compute_and_apply_vocabulary(
13661369
inputs['x'],
13671370
file_format=self._VocabFormat(),
1368-
vocab_filename='my_vocab')
1371+
store_frequency=store_frequency,
1372+
vocab_filename='my_vocab',
1373+
)
13691374
return {'index': index}
13701375

1376+
if store_frequency:
1377+
def format_pair(t: bytes, c: int) -> str:
1378+
t = t.decode('utf-8')
1379+
if t != ' ' or self._VocabFormat() != 'text':
1380+
suffix = ' ' + t
1381+
else:
1382+
suffix = ' __SPACE__'
1383+
return f'{c}{suffix}'
1384+
contents = [
1385+
format_pair(t, c).encode('utf-8')
1386+
for t, c in expected_vocab_contents.items()
1387+
]
1388+
else:
1389+
contents = [t for t in expected_vocab_contents]
1390+
1391+
expected_vocab_file_contents = {'my_vocab': contents}
1392+
13711393
self.assertAnalyzeAndTransformResults(
13721394
input_data,
13731395
input_metadata,
@@ -1650,7 +1672,9 @@ def preprocessing_fn(inputs):
16501672
coverage_top_k=1,
16511673
key_fn=key_fn,
16521674
frequency_threshold=4,
1653-
file_format=self._VocabFormat())
1675+
store_frequency=True,
1676+
file_format=self._VocabFormat(),
1677+
)
16541678

16551679
# Return input unchanged, this preprocessing_fn is a no-op except for
16561680
# computing uniques.

tensorflow_transform/experimental/mappers.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,10 @@ def compute_and_apply_approximate_vocabulary(
4545
num_oov_buckets: int = 0,
4646
vocab_filename: Optional[str] = None,
4747
weights: Optional[tf.Tensor] = None,
48-
file_format: common_types.VocabularyFileFormatType = analyzers
49-
.DEFAULT_VOCABULARY_FILE_FORMAT,
50-
name: Optional[str] = None) -> common_types.ConsistentTensorType:
48+
file_format: common_types.VocabularyFileFormatType = analyzers.DEFAULT_VOCABULARY_FILE_FORMAT,
49+
store_frequency: Optional[bool] = False,
50+
name: Optional[str] = None,
51+
) -> common_types.ConsistentTensorType:
5152
"""Generates an approximate vocabulary for `x` and maps it to an integer.
5253
5354
Args:
@@ -70,7 +71,12 @@ def compute_and_apply_approximate_vocabulary(
7071
same shape as x.
7172
file_format: (Optional) A str. The format of the resulting vocabulary file.
7273
Accepted formats are: 'tfrecord_gzip', 'text'. 'tfrecord_gzip' requires
73-
tensorflow>=2.4. The default value is 'text'.
74+
tensorflow>=2.4. The default value is 'text'.
75+
store_frequency: If True, frequency of the words is stored in the vocabulary
76+
file. In the case labels are provided, the mutual information is stored in
77+
the file instead. Each line in the file will be of the form 'frequency
78+
word'. NOTE: if True and file_format is 'text' then spaces will be
79+
replaced to avoid information loss.
7480
name: (Optional) A name for this operation.
7581
7682
Returns:
@@ -90,19 +96,27 @@ def compute_and_apply_approximate_vocabulary(
9096
"""
9197
with tf.compat.v1.name_scope(name,
9298
'compute_and_apply_approximate_vocabulary'):
99+
if store_frequency and file_format == 'text':
100+
x = tf_utils.maybe_format_vocabulary_input(x)
93101
deferred_vocab_and_filename = experimental_analyzers.approximate_vocabulary(
94102
x=x,
95103
top_k=top_k,
96104
vocab_filename=vocab_filename,
97105
weights=weights,
98106
file_format=file_format,
99-
name=name)
100-
return mappers.apply_vocabulary(
107+
store_frequency=store_frequency,
108+
name=name,
109+
)
110+
return mappers._apply_vocabulary_internal( # pylint: disable=protected-access
101111
x,
102112
deferred_vocab_and_filename,
103113
default_value,
104114
num_oov_buckets,
105-
file_format=file_format)
115+
lookup_fn=None,
116+
file_format=file_format,
117+
store_frequency=store_frequency,
118+
name=None,
119+
)
106120

107121

108122
@common.log_api_use(common.MAPPER_COLLECTION)

0 commit comments

Comments (0)