Adds a video classification export module as an example. The module is subject to change in the near future.

yeqingli · tensorflower-gardener · commit 0327186b164d · 2021-06-22T15:39:57.000-07:00
PiperOrigin-RevId: 380907598
diff --git a/official/vision/beta/serving/export_saved_model_lib.py b/official/vision/beta/serving/export_saved_model_lib.py
@@ -27,6 +27,7 @@
 from official.vision.beta.serving import detection
 from official.vision.beta.serving import image_classification
 from official.vision.beta.serving import semantic_segmentation
+from official.vision.beta.serving import video_classification
 
 
 def export_inference_graph(
@@ -99,6 +100,13 @@ def export_inference_graph(
           batch_size=batch_size,
           input_image_size=input_image_size,
           num_channels=num_channels)
+    elif isinstance(params.task,
+                    configs.video_classification.VideoClassificationTask):
+      export_module = video_classification.VideoClassificationModule(
+          params=params,
+          batch_size=batch_size,
+          input_image_size=input_image_size,
+          num_channels=num_channels)
     else:
       raise ValueError('Export module not implemented for {} task.'.format(
           type(params.task)))
diff --git a/official/vision/beta/serving/image_classification.py b/official/vision/beta/serving/image_classification.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 # Lint as: python3
-"""Detection input and model functions for serving/inference."""
+"""Image classification input and model functions for serving/inference."""
 
 import tensorflow as tf
 
diff --git a/official/vision/beta/serving/video_classification.py b/official/vision/beta/serving/video_classification.py
@@ -0,0 +1,191 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Video classification input and model functions for serving/inference."""
+from typing import Mapping, Dict, Text
+
+import tensorflow as tf
+
+from official.vision.beta.dataloaders import video_input
+from official.vision.beta.serving import export_base
+from official.vision.beta.tasks import video_classification
+
+MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
+STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)
+
+
+class VideoClassificationModule(export_base.ExportModule):
+  """Video classification Module."""
+
+  def _build_model(self):
+    """Caches preprocessing settings from `train_data` and builds the model."""
+    train_config = self.params.task.train_data
+    self._num_frames = train_config.feature_shape[0]
+    self._crop_size = train_config.feature_shape[1]
+    self._stride = train_config.temporal_stride
+    self._min_resize = train_config.min_image_size
+    self._output_audio = train_config.output_audio
+    task = video_classification.VideoClassificationTask(self.params.task)
+    return task.build_model()
+
+  def _decode_tf_example(self, encoded_inputs: tf.Tensor):
+    """Parses one serialized SequenceExample into a dict of dense tensors."""
+    # Each frame is stored as a JPEG-encoded string.
+    feature_spec = {
+        video_input.IMAGE_KEY: tf.io.FixedLenSequenceFeature((), tf.string)
+    }
+    if self._output_audio:
+      audio_key = self._params.task.validation_data.audio_feature
+      feature_spec[audio_key] = tf.io.VarLenFeature(dtype=tf.float32)
+    _, parsed = tf.io.parse_single_sequence_example(
+        encoded_inputs, {}, feature_spec)
+    # Sparse (variable-length) features are densified; others pass through.
+    return {
+        name: tf.sparse.to_dense(t) if isinstance(t, tf.SparseTensor) else t
+        for name, t in parsed.items()}
+
+  def _preprocess_image(self, image):
+    """Applies `video_input.process_image` in inference mode to one input."""
+    frames = video_input.process_image(
+        image=image,
+        is_training=False,
+        num_frames=self._num_frames,
+        stride=self._stride,
+        num_test_clips=1,
+        min_resize=self._min_resize,
+        crop_size=self._crop_size,
+        num_crops=1)
+    # TODO: take the dtype from the config instead of hard-coding float32.
+    return {'image': tf.cast(frames, tf.float32)}
+
+  def _preprocess_audio(self, audio):
+    """Casts `audio` to float32, runs `sample_sequence`, pins its shape."""
+    samples = video_input.preprocess_ops_3d.sample_sequence(
+        tf.cast(audio, dtype=tf.float32), 20, random=False, stride=1)
+    # Pin the static shape to the one configured for validation data.
+    expected_shape = self._params.task.validation_data.audio_feature_shape
+    samples = tf.ensure_shape(samples, expected_shape)
+    # TODO: take the dtype and the hard-coded 20 from the config.
+    return {'audio': samples}
+
+  @tf.function
+  def inference_from_tf_example(
+      self, encoded_inputs: tf.Tensor) -> Mapping[str, tf.Tensor]:
+    """Decodes a batch of serialized examples and returns `serve` outputs."""
+    with tf.device('cpu:0'):
+      signature = {video_input.IMAGE_KEY: tf.string}
+      if self._output_audio:
+        audio_key = self._params.task.validation_data.audio_feature
+        signature[audio_key] = tf.float32
+      inputs = tf.map_fn(
+          self._decode_tf_example, (encoded_inputs),
+          fn_output_signature=signature)
+      # `inputs` mirrors `signature`, i.e. it is keyed by the parsed feature
+      # names. Index it with those keys rather than literal 'image'/'audio',
+      # which only work if the configured names happen to match.
+      if self._output_audio:
+        return self.serve(inputs[video_input.IMAGE_KEY], inputs[audio_key])
+      # `serve` only reads the audio argument when audio output is enabled.
+      return self.serve(inputs[video_input.IMAGE_KEY], tf.zeros([1, 1]))
+
+  @tf.function
+  def inference_from_image_tensors(
+      self, input_frames: tf.Tensor) -> Mapping[str, tf.Tensor]:
+    return self.serve(input_frames, tf.zeros([1, 1]))  # Placeholder audio.
+
+  @tf.function
+  def inference_from_image_audio_tensors(
+      self, input_frames: tf.Tensor,
+      input_audio: tf.Tensor) -> Mapping[str, tf.Tensor]:
+    return self.serve(input_frames, input_audio)  # Frames and audio as given.
+
+  @tf.function
+  def inference_from_image_bytes(self, inputs: tf.Tensor):
+    raise NotImplementedError(  # Unsupported input type for this module.
+        'Video classification do not support image bytes input.')
+
+  def serve(self, input_frames: tf.Tensor, input_audio: tf.Tensor):
+    """Preprocesses frames (and audio, if enabled) and runs inference.
+
+    Args:
+      input_frames: batch of video frames fed to `_preprocess_image`.
+      input_audio: batch of audio; only read when `output_audio` is set.
+
+    Returns:
+      Dict with 'logits' and 'probs' (sigmoid if multilabel, else softmax).
+    """
+    # Input preprocessing is pinned to the CPU.
+    with tf.device('cpu:0'):
+      features = tf.map_fn(
+          self._preprocess_image, (input_frames),
+          fn_output_signature={'image': tf.float32})
+      if self._output_audio:
+        audio_features = tf.map_fn(
+            self._preprocess_audio, (input_audio),
+            fn_output_signature={'audio': tf.float32})
+        features.update(audio_features)
+    # The model itself runs outside of the CPU device scope.
+    logits = self.inference_step(features)
+    # Multilabel tasks score each class independently; otherwise the scores
+    # are normalized across classes.
+    is_multilabel = self.params.task.train_data.is_multilabel
+    to_probs = tf.math.sigmoid if is_multilabel else tf.nn.softmax
+    return {'logits': logits, 'probs': to_probs(logits)}
+
+  def get_inference_signatures(self, function_keys: Dict[Text, Text]):
+    """Builds the concrete functions to export as serving signatures.
+
+    Args:
+      function_keys: A dictionary whose keys name the input type to create a
+        signature for and whose values are the signature keys to export them
+        under. Recognized input types:
+          'image_tensor': uint8 frames only.
+          'frames_audio': uint8 frames plus float32 audio.
+          'serve_examples' / 'tf_example': a batch of serialized examples.
+
+    Returns:
+      A dictionary with key as signature key and value as concrete functions
+        that can be used for tf.saved_model.save.
+
+    Raises:
+      ValueError: If `function_keys` contains an unrecognized input type.
+    """
+    signatures = {}
+    for key, def_name in function_keys.items():
+      if key in ('image_tensor', 'frames_audio'):
+        # `*` unpacking accepts list- and tuple-valued shapes alike.
+        specs = [tf.TensorSpec(
+            shape=[self._batch_size, *self._input_image_size, 3],
+            dtype=tf.uint8,
+            name='INPUT_FRAMES')]
+        fn = self.inference_from_image_tensors
+        # The audio variant adds a second spec and takes two arguments.
+        if key == 'frames_audio':
+          audio_shape = self.params.task.train_data.audio_feature_shape
+          specs.append(tf.TensorSpec(
+              shape=[self._batch_size, *audio_shape],
+              dtype=tf.float32,
+              name='INPUT_AUDIO'))
+          fn = self.inference_from_image_audio_tensors
+      elif key in ('serve_examples', 'tf_example'):
+        # Serialized examples arrive as a 1-D batch of strings.
+        specs = [tf.TensorSpec(shape=[self._batch_size], dtype=tf.string)]
+        fn = self.inference_from_tf_example
+      else:
+        raise ValueError('Unrecognized `input_type`: {}'.format(key))
+      # Unpack so that each spec binds to its own positional argument of the
+      # traced function; a list would be bound to the first argument only.
+      signatures[def_name] = fn.get_concrete_function(*specs)
+    return signatures
diff --git a/official/vision/beta/serving/video_classification_test.py b/official/vision/beta/serving/video_classification_test.py
@@ -0,0 +1,114 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+
+# import io
+import os
+import random
+
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.common import registry_imports  # pylint: disable=unused-import
+from official.core import exp_factory
+from official.vision.beta.dataloaders import tfexample_utils
+from official.vision.beta.serving import video_classification
+
+
+class VideoClassificationTest(tf.test.TestCase, parameterized.TestCase):
+
+  def _get_classification_module(self):
+    """Builds an export module for a UCF101 config with 8x64x64x3 clips."""
+    config = exp_factory.get_exp_config('video_classification_ucf101')
+    for data_config in (config.task.train_data, config.task.validation_data):
+      data_config.feature_shape = (8, 64, 64, 3)
+    config.task.model.backbone.resnet_3d.model_id = 50
+    return video_classification.VideoClassificationModule(
+        config, batch_size=1, input_image_size=[8, 64, 64])
+
+  def _export_from_module(self, module, input_type, save_directory):
+    """Saves `module` with `input_type` exposed as 'serving_default'."""
+    fns = module.get_inference_signatures({input_type: 'serving_default'})
+    tf.saved_model.save(module, save_directory, signatures=fns)
+
+  def _get_dummy_input(self, input_type, module=None):
+    """Returns `(serving_input, frames)` for the given input type.
+
+    `serving_input` is fed to the exported signature and `frames` to
+    `module._preprocess_image`; `module` is only needed for 'tf_example'.
+    """
+    if input_type == 'image_tensor':
+      # `high` is exclusive, so 256 covers the full uint8 range.
+      images = np.random.randint(
+          low=0, high=256, size=(1, 8, 64, 64, 3), dtype=np.uint8)
+      return images, images
+    if input_type == 'tf_example':
+      example = tfexample_utils.make_video_test_example(
+          image_shape=(64, 64, 3),
+          audio_shape=(20, 128),
+          label=random.randint(0, 100)).SerializeToString()
+      # Decode the frames with the module's own example parser.
+      image_key = video_classification.video_input.IMAGE_KEY
+      decoded = tf.map_fn(
+          module._decode_tf_example,
+          elems=tf.constant([example]),
+          fn_output_signature={image_key: tf.string})
+      decoded = tf.nest.map_structure(tf.stop_gradient, decoded)
+      return [example], decoded[image_key]
+    raise ValueError(f'Unsupported input type: {input_type}')
+
+  @parameterized.parameters(
+      {'input_type': 'image_tensor'},
+      {'input_type': 'tf_example'},
+  )
+  def test_export(self, input_type):
+    """Checks that the exported signature matches the in-memory model."""
+    export_dir = self.get_temp_dir()
+    module = self._get_classification_module()
+
+    self._export_from_module(module, input_type, export_dir)
+
+    # The export must write the graph and the variable files.
+    expected_files = (
+        ('saved_model.pb',),
+        ('variables', 'variables.index'),
+        ('variables', 'variables.data-00000-of-00001'),
+    )
+    for parts in expected_files:
+      self.assertTrue(os.path.exists(os.path.join(export_dir, *parts)))
+
+    imported = tf.saved_model.load(export_dir)
+    serving_fn = imported.signatures['serving_default']
+    serving_input, frames = self._get_dummy_input(input_type, module)
+
+    # Reference outputs: apply the module's preprocessing to the same frames
+    # and run the in-memory model on the result.
+    preprocessed = tf.map_fn(
+        module._preprocess_image,
+        elems=frames,
+        fn_output_signature={'image': tf.float32})
+    preprocessed = tf.nest.map_structure(tf.stop_gradient, preprocessed)
+    expected_logits = module.model(preprocessed, training=False)
+    expected_prob = tf.nn.softmax(expected_logits)
+
+    # Outputs of the reloaded signature must match the reference.
+    actual = serving_fn(tf.constant(serving_input))
+    self.assertAllClose(actual['logits'].numpy(), expected_logits.numpy())
+    self.assertAllClose(actual['probs'].numpy(), expected_prob.numpy())
+
+
+if __name__ == '__main__':
+  tf.test.main()