Update image captioning guide to use TextVectorization

tensorflower-gardener · copybara-github · commit f934d007f9af · 2021-12-13T18:04:55.000-08:00
PiperOrigin-RevId: 416179547
diff --git a/site/en/tutorials/text/image_captioning.ipynb b/site/en/tutorials/text/image_captioning.ipynb
@@ -261,7 +261,7 @@
         "def load_image(image_path):\n",
         "    img = tf.io.read_file(image_path)\n",
         "    img = tf.io.decode_jpeg(img, channels=3)\n",
-        "    img = tf.image.resize(img, (299, 299))\n",
+        "    img = tf.keras.layers.Resizing(299, 299)(img)\n",
         "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
         "    return img, image_path"
       ]
@@ -361,23 +361,11 @@
       "source": [
         "## Preprocess and tokenize the captions\n",
         "\n",
-        "* First, you'll tokenize the captions (for example, by splitting on spaces). This gives us a  vocabulary of all of the unique words in the data (for example, \"surfing\", \"football\", and so on).\n",
-        "* Next, you'll limit the vocabulary size to the top 5,000 words (to save memory). You'll replace all other words with the token \"UNK\" (unknown).\n",
-        "* You then create word-to-index and index-to-word mappings.\n",
-        "* Finally, you pad all sequences to be the same length as the longest one."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "HZfK8RhQRPFj"
-      },
-      "outputs": [],
-      "source": [
-        "# Find the maximum length of any caption in the dataset\n",
-        "def calc_max_length(tensor):\n",
-        "    return max(len(t) for t in tensor)"
+        "You will transform the text captions into integer sequences using the [TextVectorization](https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization) layer, with the following steps:\n",
+        "\n",
+        "* Use [adapt](https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization#adapt) to iterate over all captions, split the captions into words, and compute a vocabulary of the top 5,000 words (to save memory).\n",
+        "* Tokenize all captions by mapping each word to its index in the vocabulary. All output sequences will be padded to length 50.\n",
+        "* Create word-to-index and index-to-word mappings to display results."
       ]
     },
     {
@@ -388,61 +376,55 @@
       },
       "outputs": [],
       "source": [
-        "# Choose the top 5000 words from the vocabulary\n",
-        "top_k = 5000\n",
-        "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,\n",
-        "                                                  oov_token=\"<unk>\",\n",
-        "                                                  filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~')\n",
-        "tokenizer.fit_on_texts(train_captions)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "8Q44tNQVRPFt"
-      },
-      "outputs": [],
-      "source": [
-        "tokenizer.word_index['<pad>'] = 0\n",
-        "tokenizer.index_word[0] = '<pad>'"
+        "caption_dataset = tf.data.Dataset.from_tensor_slices(train_captions)\n",
+        "\n",
+        "# Override the default standardization of TextVectorization to preserve\n",
+        "# \"<>\" characters, so that the <start> and <end> tokens are kept intact.\n",
+        "def standardize(inputs):\n",
+        "  inputs = tf.strings.lower(inputs)\n",
+        "  return tf.strings.regex_replace(inputs,\n",
+        "                                  r\"!\\\"#$%&\\(\\)\\*\\+.,-/:;=?@\\[\\\\\\]^_`{|}~\", \"\")\n",
+        "\n",
+        "# Max word count for a caption.\n",
+        "max_length = 50\n",
+        "# Use the top 5000 words for a vocabulary.\n",
+        "vocabulary_size = 5000\n",
+        "tokenizer = tf.keras.layers.TextVectorization(\n",
+        "    max_tokens=vocabulary_size,\n",
+        "    standardize=standardize,\n",
+        "    output_sequence_length=max_length)\n",
+        "# Learn the vocabulary from the caption data.\n",
+        "tokenizer.adapt(caption_dataset)"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "0fpJb5ojRPFv"
+        "id": "Uaq07VVEu36f"
       },
       "outputs": [],
       "source": [
         "# Create the tokenized vectors\n",
-        "train_seqs = tokenizer.texts_to_sequences(train_captions)"
+        "cap_vector = caption_dataset.map(lambda x: tokenizer(x))"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "AidglIZVRPF4"
-      },
-      "outputs": [],
-      "source": [
-        "# Pad each vector to the max_length of the captions\n",
-        "# If you do not provide a max_length value, pad_sequences calculates it automatically\n",
-        "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "gL0wkttkRPGA"
+        "id": "8Q44tNQVRPFt"
       },
       "outputs": [],
       "source": [
-        "# Calculates the max_length, which is used to store the attention weights\n",
-        "max_length = calc_max_length(train_seqs)"
+        "# Create mappings for words to indices and indices to words.\n",
+        "word_to_index = tf.keras.layers.StringLookup(\n",
+        "    mask_token=\"\",\n",
+        "    vocabulary=tokenizer.get_vocabulary())\n",
+        "index_to_word = tf.keras.layers.StringLookup(\n",
+        "    mask_token=\"\",\n",
+        "    vocabulary=tokenizer.get_vocabulary(),\n",
+        "    invert=True)"
       ]
     },
     {
@@ -531,7 +513,6 @@
         "BUFFER_SIZE = 1000\n",
         "embedding_dim = 256\n",
         "units = 512\n",
-        "vocab_size = top_k + 1\n",
         "num_steps = len(img_name_train) // BATCH_SIZE\n",
         "# Shape of the vector extracted from InceptionV3 is (64, 2048)\n",
         "# These two variables represent that vector shape\n",
@@ -565,7 +546,7 @@
         "\n",
         "# Use map to load the numpy files in parallel\n",
         "dataset = dataset.map(lambda item1, item2: tf.numpy_function(\n",
-        "          map_func, [item1, item2], [tf.float32, tf.int32]),\n",
+        "          map_func, [item1, item2], [tf.float32, tf.int64]),\n",
         "          num_parallel_calls=tf.data.AUTOTUNE)\n",
         "\n",
         "# Shuffle and batch\n",
@@ -713,7 +694,7 @@
       "outputs": [],
       "source": [
         "encoder = CNN_Encoder(embedding_dim)\n",
-        "decoder = RNN_Decoder(embedding_dim, units, vocab_size)"
+        "decoder = RNN_Decoder(embedding_dim, units, tokenizer.vocabulary_size())"
       ]
     },
     {
@@ -824,7 +805,7 @@
         "  # because the captions are not related from image to image\n",
         "  hidden = decoder.reset_state(batch_size=target.shape[0])\n",
         "\n",
-        "  dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)\n",
+        "  dec_input = tf.expand_dims([word_to_index('<start>')] * target.shape[0], 1)\n",
         "\n",
         "  with tf.GradientTape() as tape:\n",
         "      features = encoder(img_tensor)\n",
@@ -929,7 +910,7 @@
         "\n",
         "    features = encoder(img_tensor_val)\n",
         "\n",
-        "    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)\n",
+        "    dec_input = tf.expand_dims([word_to_index('<start>')], 0)\n",
         "    result = []\n",
         "\n",
         "    for i in range(max_length):\n",
@@ -940,9 +921,10 @@
         "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
         "\n",
         "        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()\n",
-        "        result.append(tokenizer.index_word[predicted_id])\n",
+        "        predicted_word = tf.compat.as_text(index_to_word(predicted_id).numpy())\n",
+        "        result.append(predicted_word)\n",
         "\n",
-        "        if tokenizer.index_word[predicted_id] == '<end>':\n",
+        "        if predicted_word == '<end>':\n",
         "            return result, attention_plot\n",
         "\n",
         "        dec_input = tf.expand_dims([predicted_id], 0)\n",
@@ -967,7 +949,7 @@
         "    len_result = len(result)\n",
         "    for i in range(len_result):\n",
         "        temp_att = np.resize(attention_plot[i], (8, 8))\n",
-        "        grid_size = max(np.ceil(len_result/2), 2)\n",
+        "        grid_size = max(int(np.ceil(len_result/2)), 2)\n",
         "        ax = fig.add_subplot(grid_size, grid_size, i+1)\n",
         "        ax.set_title(result[i])\n",
         "        img = ax.imshow(temp_image)\n",
@@ -988,8 +970,8 @@
         "# captions on the validation set\n",
         "rid = np.random.randint(0, len(img_name_val))\n",
         "image = img_name_val[rid]\n",
-        "real_caption = ' '.join([tokenizer.index_word[i]\n",
-        "                        for i in cap_val[rid] if i not in [0]])\n",
+        "real_caption = ' '.join([tf.compat.as_text(index_to_word(i).numpy())\n",
+        "                         for i in cap_val[rid] if i not in [0]])\n",
         "result, attention_plot = evaluate(image)\n",
         "\n",
         "print('Real Caption:', real_caption)\n",
@@ -1044,6 +1026,7 @@
     "colab": {
       "collapsed_sections": [],
       "name": "image_captioning.ipynb",
+      "provenance": [],
       "toc_visible": true
     },
     "kernelspec": {