Skip to content

Commit bfb373d

Browse files
shilpakancharla and copybara-github
authored and committed
Add Video classification with 3D CNN tutorial
PiperOrigin-RevId: 482618613
1 parent 4a7c942 commit bfb373d

File tree

3 files changed

+1077
-14
lines changed

3 files changed

+1077
-14
lines changed

site/en/tutorials/_toc.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ toc:
9797
- title: "Distributed input"
9898
path: /tutorials/distribute/input
9999

100-
- title: "Images"
100+
- title: "Vision"
101101
style: accordion
102102
section:
103103
- title: "Convolutional Neural Network"
@@ -115,6 +115,9 @@ toc:
115115
- title: "Object detection with TF Hub"
116116
path: /hub/tutorials/tf2_object_detection
117117
status: external
118+
- title: "Video classification"
119+
status: new
120+
path: /tutorials/video/video_classification
118121

119122
- title: "Text"
120123
style: accordion

site/en/tutorials/load_data/video.ipynb

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@
239239
"outputs": [],
240240
"source": [
241241
"def get_files_per_class(files):\n",
242-
" \"\"\" Retrieve the files that belong to each class. \n",
242+
" \"\"\" Retrieve the files that belong to each class.\n",
243243
"\n",
244244
" Args:\n",
245245
" files: List of files in the dataset.\n",
@@ -553,10 +553,34 @@
553553
"id": "D1vvyT0F7JAZ"
554554
},
555555
"source": [
556-
"The following function splits the videos into frames, reads a randomly chosen span of `n_frames` out of a video file, and returns them as a NumPy `array`.\n",
556+
"The `frames_from_video_file` function splits the videos into frames, reads a randomly chosen span of `n_frames` out of a video file, and returns them as a NumPy `array`.\n",
557557
"To reduce memory and computation overhead, choose a **small** number of frames. In addition, pick the **same** number of frames from each video, which makes it easier to work on batches of data.\n"
558558
]
559559
},
560+
{
561+
"cell_type": "code",
562+
"execution_count": null,
563+
"metadata": {
564+
"id": "vNBCiV3bMzpD"
565+
},
566+
"outputs": [],
567+
"source": [
568+
"def format_frames(frame, output_size):\n",
569+
" \"\"\"\n",
570+
" Pad and resize an image from a video.\n",
571+
" \n",
572+
" Args:\n",
573+
" frame: Image that needs to resized and padded. \n",
574+
" output_size: Pixel size of the output frame image.\n",
575+
"\n",
576+
" Return:\n",
577+
" Formatted frame with padding of specified output size.\n",
578+
" \"\"\"\n",
579+
" frame = tf.image.convert_image_dtype(frame, tf.float32)\n",
580+
" frame = tf.image.resize_with_pad(frame, *output_size)\n",
581+
" return frame"
582+
]
583+
},
560584
{
561585
"cell_type": "code",
562586
"execution_count": null,
@@ -565,8 +589,9 @@
565589
},
566590
"outputs": [],
567591
"source": [
568-
"def frames_from_video_file(video_path, n_frames, output_size = (224,224)):\n",
569-
" \"\"\" Creates frames from each video file present for each category.\n",
592+
"def frames_from_video_file(video_path, n_frames, output_size = (224,224), frame_step = 15):\n",
593+
" \"\"\"\n",
594+
" Creates frames from each video file present for each category.\n",
570595
"\n",
571596
" Args:\n",
572597
" video_path: File path to the video.\n",
@@ -576,32 +601,32 @@
576601
" Return:\n",
577602
"      A NumPy array of frames in the shape of (n_frames, height, width, channels).\n",
578603
" \"\"\"\n",
579-
" # Read each frame by frame\n",
604+
" # Read each video frame by frame\n",
580605
" result = []\n",
581606
" src = cv2.VideoCapture(str(video_path)) \n",
582607
"\n",
583608
" video_length = src.get(cv2.CAP_PROP_FRAME_COUNT)\n",
584609
"\n",
585-
" # If the number of frames wanted is greater than the length of the video, then start from beginning\n",
586-
" if n_frames > video_length:\n",
610+
" need_length = 1 + (n_frames - 1) * frame_step\n",
611+
"\n",
612+
" if need_length > video_length:\n",
587613
" start = 0\n",
588614
" else:\n",
589-
" # Otherwise, start at another random point within the video\n",
590-
" max_start = video_length - n_frames\n",
591-
" start = random.randint(0, max_start)\n",
615+
" max_start = video_length - need_length\n",
616+
" start = random.randint(0, max_start + 1)\n",
592617
"\n",
593618
" src.set(cv2.CAP_PROP_POS_FRAMES, start)\n",
619+
" ret, frame = src.read()\n",
620+
" result.append(format_frames(frame, output_size))\n",
594621
"\n",
595-
" for _ in range(n_frames):\n",
596-
" ret, frame = src.read()\n",
622+
" for _ in range(n_frames - 1):\n",
597623
" if ret:\n",
598624
" frame = tf.image.convert_image_dtype(frame, tf.float32)\n",
599625
" frame = tf.image.resize_with_pad(frame, *output_size)\n",
600626
" result.append(frame)\n",
601627
" else:\n",
602628
" result.append(np.zeros_like(result[0]))\n",
603629
" src.release()\n",
604-
" # Ensure that the color scheme is not inverted\n",
605630
" result = np.array(result)[..., [2, 1, 0]]\n",
606631
"\n",
607632
" return result"

0 commit comments

Comments
 (0)