Skip to content

Commit a44092b

Browse files
No public description
PiperOrigin-RevId: 567692565
1 parent 0d75a9d commit a44092b

File tree

3 files changed: +31 additions, −6 deletions

official/projects/yt8m/configs/yt8m.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ class DataConfig(cfg.DataConfig):
8080
sample_random_frames: bool = True
8181
# Sample random frames if not None. No sampling in inference.
8282
num_sample_frames: Optional[int] = 300
83+
input_per_feature_l2_norm: bool = False
8384
prefetch_buffer_size: int = 100
8485
shuffle_buffer_size: int = 100
8586
num_classes: int = 3862

official/projects/yt8m/dataloaders/yt8m_input.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ def _process_segment_and_label(video_matrix, num_frames, contexts,
157157
return output_dict
158158

159159

160+
# TODO(allenyan, zhengxu): Adds a unit test for this function.
160161
def _get_video_matrix(features, feature_size, dtype, max_frames,
161162
max_quantized_value, min_quantized_value):
162163
"""Decodes features from an input string and quantizes it.
@@ -187,8 +188,16 @@ def _get_video_matrix(features, feature_size, dtype, max_frames,
187188
return feature_matrix, num_frames
188189

189190

190-
def _concat_features(features, feature_names, feature_sizes, feature_dtypes,
191-
max_frames, max_quantized_value, min_quantized_value):
191+
def _concat_features(
192+
features,
193+
feature_names,
194+
feature_sizes,
195+
feature_dtypes,
196+
max_frames,
197+
max_quantized_value,
198+
min_quantized_value,
199+
per_feature_l2_norm=False,
200+
):
192201
"""Loads (potentially) different types of features and concatenates them.
193202
194203
Args:
@@ -199,6 +208,7 @@ def _concat_features(features, feature_names, feature_sizes, feature_dtypes,
199208
max_frames: number of frames in the sequence
200209
max_quantized_value: the maximum of the quantized value.
201210
min_quantized_value: the minimum of the quantized value.
211+
per_feature_l2_norm: whether to l2 normalize each feature.
202212
203213
Returns:
204214
video_matrix: different features concatenated into one matrix
@@ -225,6 +235,8 @@ def _concat_features(features, feature_names, feature_sizes, feature_dtypes,
225235
min_quantized_value)
226236
num_common_frames = tf.math.minimum(num_frames_in_this_feature,
227237
num_common_frames)
238+
if per_feature_l2_norm:
239+
feature_matrix = tf.math.l2_normalize(feature_matrix, axis=-1)
228240
feature_matrices[i] = feature_matrix
229241

230242
for i in range(num_features):
@@ -347,14 +359,15 @@ def __init__(
347359
self._num_sample_frames = input_params.num_sample_frames
348360
self._max_quantized_value = max_quantized_value
349361
self._min_quantized_value = min_quantized_value
362+
self._input_per_feature_l2_norm = input_params.input_per_feature_l2_norm
350363

351364
def _parse_train_data(self, decoded_tensors):
352365
"""Parses data for training."""
353366
# loads (potentially) different types of features and concatenates them
354367
video_matrix, num_frames = _concat_features(
355368
decoded_tensors, self._feature_names, self._feature_sizes,
356369
self._feature_dtypes, self._max_frames, self._max_quantized_value,
357-
self._min_quantized_value)
370+
self._min_quantized_value, self._input_per_feature_l2_norm)
358371
if not self._include_video_id and "id" in decoded_tensors:
359372
del decoded_tensors["id"]
360373

@@ -383,7 +396,7 @@ def _parse_eval_data(self, decoded_tensors):
383396
video_matrix, num_frames = _concat_features(
384397
decoded_tensors, self._feature_names, self._feature_sizes,
385398
self._feature_dtypes, self._max_frames, self._max_quantized_value,
386-
self._min_quantized_value)
399+
self._min_quantized_value, self._input_per_feature_l2_norm)
387400
if not self._include_video_id and "id" in decoded_tensors:
388401
del decoded_tensors["id"]
389402

official/projects/yt8m/dataloaders/yt8m_input_test.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,9 +160,16 @@ def test_read_segment_level_input(self, include_video_id, num_sample_frames):
160160
if include_video_id:
161161
self.assertEqual(example['video_ids'].shape.as_list(), [batch_size])
162162

163-
@parameterized.parameters((True, 4), (False, 4), (False, None))
163+
@parameterized.parameters(
164+
(True, 4, False),
165+
(False, 4, False),
166+
(False, None, False),
167+
(True, 4, True),
168+
(False, 4, True),
169+
(False, None, True),
170+
)
164171
def test_read_video_level_float_input(
165-
self, include_video_id, num_sample_frames
172+
self, include_video_id, num_sample_frames, per_feature_l2_norm
166173
):
167174
data_dir = os.path.join(self.get_temp_dir(), 'data2')
168175
tf.io.gfile.makedirs(data_dir)
@@ -188,6 +195,7 @@ def test_read_video_level_float_input(
188195
params.feature_from_bytes = (False, False)
189196
params.label_field = 'clip/label/index'
190197
params.include_video_id = include_video_id
198+
params.input_per_feature_l2_norm = per_feature_l2_norm
191199
reader = self.create_input_reader(params)
192200

193201
dataset = reader.read()
@@ -211,6 +219,9 @@ def test_read_video_level_float_input(
211219
'FEATURE/feature/floats'].feature[0].float_list.value
212220
expected_labels = examples[0].context.feature[
213221
params.label_field].int64_list.value
222+
if per_feature_l2_norm:
223+
expected_feature = tf.math.l2_normalize(expected_feature, axis=-1)
224+
expected_context = tf.math.l2_normalize(expected_context, axis=-1)
214225
self.assertAllEqual(expected_feature,
215226
example['video_matrix'][0, 0, params.feature_sizes[0]:])
216227
self.assertAllEqual(expected_context,

0 commit comments

Comments
 (0)