
Commit 85216e9

chenmoneygithub authored
Compatibility change for KerasNLP 0.4 release (#1169)

* Get keras-io guide compatible with kerasNLP 0.4 release
* run the colab
* rewording
* address comments

Co-authored-by: chenmoneygithub <[email protected]>
1 parent c7f11c7 commit 85216e9

9 files changed: +381 -545 lines changed

examples/nlp/fnet_classification_with_keras_nlp.py

Lines changed: 8 additions & 18 deletions
@@ -2,7 +2,7 @@
 Title: Text Classification using FNet
 Author: [Abheesht Sharma](https://github.com/abheesht17/)
 Date created: 2022/06/01
-Last modified: 2022/06/01
+Last modified: 2022/12/21
 Description: Text Classification on the IMDb Dataset using `keras_nlp.layers.FNetEncoder` layer.
 Accelerator: GPU
 """
@@ -51,12 +51,10 @@
 """

 import keras_nlp
-import random
 import tensorflow as tf
 import os

 from tensorflow import keras
-from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

 keras.utils.set_random_seed(42)

@@ -151,28 +149,20 @@
 training it on a corpus gives us a vocabulary of subwords. A subword tokenizer
 is a compromise between word tokenizers (word tokenizers need very large
 vocabularies for good coverage of input words), and character tokenizers
-(characters don't really encode meaning like words do). Luckily, TensorFlow Text
-makes it very simple to train WordPiece on a corpus as described in
-[this guide](https://www.tensorflow.org/text/guide/subwords_tokenizer).
+(characters don't really encode meaning like words do). Luckily, KerasNLP
+makes it very simple to train WordPiece on a corpus with the
+`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility.

 Note: The official implementation of FNet uses the SentencePiece Tokenizer.
 """


 def train_word_piece(ds, vocab_size, reserved_tokens):
-    bert_vocab_args = dict(
-        # The target vocabulary size
-        vocab_size=vocab_size,
-        # Reserved tokens that must be included in the vocabulary
-        reserved_tokens=reserved_tokens,
-        # Arguments for `text.BertTokenizer`
-        bert_tokenizer_params={"lower_case": True},
-    )
-
-    # Extract text samples (remove the labels).
     word_piece_ds = ds.unbatch().map(lambda x, y: x)
-    vocab = bert_vocab.bert_vocab_from_dataset(
-        word_piece_ds.batch(1000).prefetch(2), **bert_vocab_args
+    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
+        word_piece_ds.batch(1000).prefetch(2),
+        vocabulary_size=vocab_size,
+        reserved_tokens=reserved_tokens,
    )
     return vocab
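For context, a minimal sketch of how the updated `train_word_piece` helper can be exercised after this change. The helper body mirrors the diff above; the toy dataset, `vocab_size=100`, the `reserved_tokens` list, and the `sequence_length=8` tokenizer setting are illustrative assumptions, not values taken from the guide.

import keras_nlp
import tensorflow as tf


def train_word_piece(ds, vocab_size, reserved_tokens):
    # Drop the labels so only raw text reaches the vocabulary trainer.
    word_piece_ds = ds.unbatch().map(lambda x, y: x)
    # KerasNLP 0.4 utility replacing tensorflow_text's bert_vocab_from_dataset.
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )
    return vocab


# Hypothetical toy data standing in for the IMDb splits used in the guide.
toy_ds = tf.data.Dataset.from_tensor_slices(
    (["the movie was great", "the movie was terrible"], [1, 0])
).batch(2)

vocab = train_word_piece(toy_ds, vocab_size=100, reserved_tokens=["[PAD]", "[UNK]"])

# The learned vocabulary plugs straight into KerasNLP's WordPiece tokenizer.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab, lowercase=False, sequence_length=8
)
print(tokenizer("the movie was great"))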

examples/nlp/ipynb/fnet_classification_with_keras_nlp.ipynb

Lines changed: 33 additions & 46 deletions
@@ -10,7 +10,7 @@
 "\n",
 "**Author:** [Abheesht Sharma](https://github.com/abheesht17/)<br>\n",
 "**Date created:** 2022/06/01<br>\n",
-"**Last modified:** 2022/06/01<br>\n",
+"**Last modified:** 2022/12/21<br>\n",
 "**Description:** Text Classification on the IMDb Dataset using `keras_nlp.layers.FNetEncoder` layer."
 ]
 },
@@ -69,19 +69,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
 "outputs": [],
 "source": [
 "import keras_nlp\n",
-"import random\n",
 "import tensorflow as tf\n",
 "import os\n",
 "\n",
 "from tensorflow import keras\n",
-"from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab\n",
 "\n",
 "keras.utils.set_random_seed(42)"
 ]
@@ -97,7 +95,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -125,7 +123,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -147,7 +145,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -172,7 +170,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -193,7 +191,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -227,7 +225,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -249,7 +247,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -258,11 +256,11 @@
 "for text_batch, label_batch in train_ds.take(1):\n",
 "    for i in range(3):\n",
 "        print(text_batch.numpy()[i])\n",
-"        print(label_batch.numpy()[i])\n",
-""
+"        print(label_batch.numpy()[i])\n"
 ]
 },
 {
+"attachments": {},
 "cell_type": "markdown",
 "metadata": {
 "colab_type": "text"
@@ -279,39 +277,30 @@
 "training it on a corpus gives us a vocabulary of subwords. A subword tokenizer\n",
 "is a compromise between word tokenizers (word tokenizers need very large\n",
 "vocabularies for good coverage of input words), and character tokenizers\n",
-"(characters don't really encode meaning like words do). Luckily, TensorFlow Text\n",
-"makes it very simple to train WordPiece on a corpus as described in\n",
-"[this guide](https://www.tensorflow.org/text/guide/subwords_tokenizer).\n",
+"(characters don't really encode meaning like words do). Luckily, KerasNLP\n",
+"makes it very simple to train WordPiece on a corpus with the \n",
+"`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility.\n",
 "\n",
 "Note: The official implementation of FNet uses the SentencePiece Tokenizer."
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
 "outputs": [],
 "source": [
 "\n",
 "def train_word_piece(ds, vocab_size, reserved_tokens):\n",
-"    bert_vocab_args = dict(\n",
-"        # The target vocabulary size\n",
-"        vocab_size=vocab_size,\n",
-"        # Reserved tokens that must be included in the vocabulary\n",
-"        reserved_tokens=reserved_tokens,\n",
-"        # Arguments for `text.BertTokenizer`\n",
-"        bert_tokenizer_params={\"lower_case\": True},\n",
-"    )\n",
-"\n",
-"    # Extract text samples (remove the labels).\n",
 "    word_piece_ds = ds.unbatch().map(lambda x, y: x)\n",
-"    vocab = bert_vocab.bert_vocab_from_dataset(\n",
-"        word_piece_ds.batch(1000).prefetch(2), **bert_vocab_args\n",
+"    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(\n",
+"        word_piece_ds.batch(1000).prefetch(2),\n",
+"        vocabulary_size=vocab_size,\n",
+"        reserved_tokens=reserved_tokens,\n",
 "    )\n",
-"    return vocab\n",
-""
+"    return vocab\n"
 ]
 },
 {
@@ -329,7 +318,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -351,7 +340,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -374,7 +363,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -400,7 +389,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -411,8 +400,7 @@
 "\n",
 "print(\"Sentence: \", input_sentence_ex)\n",
 "print(\"Tokens: \", input_tokens_ex)\n",
-"print(\"Recovered text after detokenizing: \", tokenizer.detokenize(input_tokens_ex))\n",
-""
+"print(\"Recovered text after detokenizing: \", tokenizer.detokenize(input_tokens_ex))\n"
 ]
 },
 {
@@ -429,7 +417,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -476,7 +464,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -517,7 +505,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -547,14 +535,13 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
 "outputs": [],
 "source": [
-"fnet_classifier.evaluate(test_ds, batch_size=BATCH_SIZE)\n",
-""
+"fnet_classifier.evaluate(test_ds, batch_size=BATCH_SIZE)\n"
 ]
 },
 {
@@ -574,7 +561,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -633,7 +620,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -691,4 +678,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 0
-}
+}
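Most of the notebook hunks above are mechanical: unexecuted code cells switch from "execution_count": 0 to "execution_count": null, and stray empty-string entries at the end of "source" lists are dropped. As a rough illustration only (keras-io regenerates these notebooks with its own tooling; the file path and JSON formatting below are assumptions), the normalization amounts to something like:

import json

# Hypothetical local path; the real notebook lives under examples/nlp/ipynb/ in keras-io.
path = "fnet_classification_with_keras_nlp.ipynb"

with open(path) as f:
    nb = json.load(f)

for cell in nb["cells"]:
    if cell.get("cell_type") == "code":
        # Unexecuted notebooks should store null, not 0, as the execution count.
        cell["execution_count"] = None
    # Drop stray empty-string entries from the cell's source list.
    cell["source"] = [line for line in cell.get("source", []) if line != ""]

with open(path, "w") as f:
    json.dump(nb, f, indent=1)
    f.write("\n")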
