1010 " \n " ,
1111 " **Author:** [Abheesht Sharma](https://github.com/abheesht17/)<br>\n " ,
1212 " **Date created:** 2022/06/01<br>\n " ,
13- " **Last modified:** 2022/06/01 <br>\n " ,
13+ " **Last modified:** 2022/12/21 <br>\n " ,
1414 " **Description:** Text Classification on the IMDb Dataset using `keras_nlp.layers.FNetEncoder` layer."
1515 ]
1616 },
6969 },
7070 {
7171 "cell_type" : " code" ,
72- "execution_count" : 0 ,
72+ "execution_count" : null ,
7373 "metadata" : {
7474 "colab_type" : " code"
7575 },
7676 "outputs" : [],
7777 "source" : [
7878 " import keras_nlp\n " ,
79- " import random\n " ,
8079 " import tensorflow as tf\n " ,
8180 " import os\n " ,
8281 " \n " ,
8382 " from tensorflow import keras\n " ,
84- " from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab\n " ,
8583 " \n " ,
8684 " keras.utils.set_random_seed(42)"
8785 ]
9795 },
9896 {
9997 "cell_type" : " code" ,
100- "execution_count" : 0 ,
98+ "execution_count" : null ,
10199 "metadata" : {
102100 "colab_type" : " code"
103101 },
125123 },
126124 {
127125 "cell_type" : " code" ,
128- "execution_count" : 0 ,
126+ "execution_count" : null ,
129127 "metadata" : {
130128 "colab_type" : " code"
131129 },
147145 },
148146 {
149147 "cell_type" : " code" ,
150- "execution_count" : 0 ,
148+ "execution_count" : null ,
151149 "metadata" : {
152150 "colab_type" : " code"
153151 },
172170 },
173171 {
174172 "cell_type" : " code" ,
175- "execution_count" : 0 ,
173+ "execution_count" : null ,
176174 "metadata" : {
177175 "colab_type" : " code"
178176 },
193191 },
194192 {
195193 "cell_type" : " code" ,
196- "execution_count" : 0 ,
194+ "execution_count" : null ,
197195 "metadata" : {
198196 "colab_type" : " code"
199197 },
227225 },
228226 {
229227 "cell_type" : " code" ,
230- "execution_count" : 0 ,
228+ "execution_count" : null ,
231229 "metadata" : {
232230 "colab_type" : " code"
233231 },
249247 },
250248 {
251249 "cell_type" : " code" ,
252- "execution_count" : 0 ,
250+ "execution_count" : null ,
253251 "metadata" : {
254252 "colab_type" : " code"
255253 },
258256 " for text_batch, label_batch in train_ds.take(1):\n " ,
259257 " for i in range(3):\n " ,
260258 " print(text_batch.numpy()[i])\n " ,
261- " print(label_batch.numpy()[i])\n " ,
262- " "
259+ " print(label_batch.numpy()[i])\n "
263260 ]
264261 },
265262 {
263+ "attachments" : {},
266264 "cell_type" : " markdown" ,
267265 "metadata" : {
268266 "colab_type" : " text"
279277 " training it on a corpus gives us a vocabulary of subwords. A subword tokenizer\n " ,
280278 " is a compromise between word tokenizers (word tokenizers need very large\n " ,
281279 " vocabularies for good coverage of input words), and character tokenizers\n " ,
282- " (characters don't really encode meaning like words do). Luckily, TensorFlow Text \n " ,
283- " makes it very simple to train WordPiece on a corpus as described in \n " ,
284- " [this guide](https://www.tensorflow.org/text/guide/subwords_tokenizer) .\n " ,
280+ " (characters don't really encode meaning like words do). Luckily, KerasNLP \n " ,
281+ " makes it very simple to train WordPiece on a corpus with the \n " ,
282+ " `keras_nlp.tokenizers.compute_word_piece_vocabulary` utility .\n " ,
285283 " \n " ,
286284 " Note: The official implementation of FNet uses the SentencePiece Tokenizer."
287285 ]
288286 },
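The note above mentions that the official FNet implementation uses SentencePiece. KerasNLP also ships a SentencePiece tokenizer, so readers who want to mirror the official setup could swap it in. A minimal sketch, assuming a pre-trained SentencePiece proto is available (the `spm.model` path is a placeholder, not an asset from this example):

```python
import keras_nlp

# Load a SentencePiece model trained elsewhere; "spm.model" is a
# hypothetical path, not something this notebook produces.
sp_tokenizer = keras_nlp.tokenizers.SentencePieceTokenizer(proto="spm.model")

# Tokenize a sample string and recover it, mirroring the WordPiece
# round-trip check used later in this notebook.
token_ids = sp_tokenizer("this movie was surprisingly good")
print(sp_tokenizer.detokenize(token_ids))
```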
289287 {
290288 "cell_type" : " code" ,
291- "execution_count" : 0 ,
289+ "execution_count" : null ,
292290 "metadata" : {
293291 "colab_type" : " code"
294292 },
295293 "outputs" : [],
296294 "source" : [
297295 " \n " ,
298296 " def train_word_piece(ds, vocab_size, reserved_tokens):\n " ,
299- " bert_vocab_args = dict(\n " ,
300- " # The target vocabulary size\n " ,
301- " vocab_size=vocab_size,\n " ,
302- " # Reserved tokens that must be included in the vocabulary\n " ,
303- " reserved_tokens=reserved_tokens,\n " ,
304- " # Arguments for `text.BertTokenizer`\n " ,
305- " bert_tokenizer_params={\" lower_case\" : True},\n " ,
306- " )\n " ,
307- " \n " ,
308- " # Extract text samples (remove the labels).\n " ,
309297 " word_piece_ds = ds.unbatch().map(lambda x, y: x)\n " ,
310- " vocab = bert_vocab.bert_vocab_from_dataset(\n " ,
311- " word_piece_ds.batch(1000).prefetch(2), **bert_vocab_args\n " ,
298+ " vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(\n " ,
299+ " word_piece_ds.batch(1000).prefetch(2),\n " ,
300+ " vocabulary_size=vocab_size,\n " ,
301+ " reserved_tokens=reserved_tokens,\n " ,
312302 " )\n " ,
313- " return vocab\n " ,
314- " "
303+ " return vocab\n "
315304 ]
316305 },
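The diff shows the `train_word_piece` definition but not its invocation. For orientation, a hypothetical call might look like the following; the vocabulary size and reserved-token list are assumptions, not values taken from this diff:

```python
# "[PAD]" pads batches to a fixed length; "[UNK]" covers out-of-vocabulary
# pieces. Reserving them guarantees they survive vocabulary pruning.
reserved_tokens = ["[PAD]", "[UNK]"]  # assumed list, not shown in this diff

vocab = train_word_piece(train_ds, vocab_size=15000, reserved_tokens=reserved_tokens)
print(vocab[100:110])  # inspect a few of the learned subwords
```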
317306 {
329318 },
330319 {
331320 "cell_type" : " code" ,
332- "execution_count" : 0 ,
321+ "execution_count" : null ,
333322 "metadata" : {
334323 "colab_type" : " code"
335324 },
351340 },
352341 {
353342 "cell_type" : " code" ,
354- "execution_count" : 0 ,
343+ "execution_count" : null ,
355344 "metadata" : {
356345 "colab_type" : " code"
357346 },
374363 },
375364 {
376365 "cell_type" : " code" ,
377- "execution_count" : 0 ,
366+ "execution_count" : null ,
378367 "metadata" : {
379368 "colab_type" : " code"
380369 },
400389 },
401390 {
402391 "cell_type" : " code" ,
403- "execution_count" : 0 ,
392+ "execution_count" : null ,
404393 "metadata" : {
405394 "colab_type" : " code"
406395 },
411400 " \n " ,
412401 " print(\" Sentence: \" , input_sentence_ex)\n " ,
413402 " print(\" Tokens: \" , input_tokens_ex)\n " ,
414- " print(\" Recovered text after detokenizing: \" , tokenizer.detokenize(input_tokens_ex))\n " ,
415- " "
403+ " print(\" Recovered text after detokenizing: \" , tokenizer.detokenize(input_tokens_ex))\n "
416404 ]
417405 },
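The detokenization check above uses a `tokenizer` object whose construction falls outside this diff. A minimal sketch of how it is typically built with KerasNLP, assuming the `vocab` list returned by `train_word_piece`; the sequence length is a placeholder, not a value from this diff:

```python
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,     # list of subwords returned by train_word_piece
    lowercase=False,      # keep casing consistent with vocabulary training
    sequence_length=512,  # assumed padded length, not taken from this diff
)

# Round-trip a sample, as in the cell above.
input_tokens_ex = tokenizer("an example sentence")
print(tokenizer.detokenize(input_tokens_ex))
```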
418406 {
429417 },
430418 {
431419 "cell_type" : " code" ,
432- "execution_count" : 0 ,
420+ "execution_count" : null ,
433421 "metadata" : {
434422 "colab_type" : " code"
435423 },
476464 },
477465 {
478466 "cell_type" : " code" ,
479- "execution_count" : 0 ,
467+ "execution_count" : null ,
480468 "metadata" : {
481469 "colab_type" : " code"
482470 },
517505 },
518506 {
519507 "cell_type" : " code" ,
520- "execution_count" : 0 ,
508+ "execution_count" : null ,
521509 "metadata" : {
522510 "colab_type" : " code"
523511 },
547535 },
548536 {
549537 "cell_type" : " code" ,
550- "execution_count" : 0 ,
538+ "execution_count" : null ,
551539 "metadata" : {
552540 "colab_type" : " code"
553541 },
554542 "outputs" : [],
555543 "source" : [
556- " fnet_classifier.evaluate(test_ds, batch_size=BATCH_SIZE)\n " ,
557- " "
544+ " fnet_classifier.evaluate(test_ds, batch_size=BATCH_SIZE)\n "
558545 ]
559546 },
560547 {
574561 },
575562 {
576563 "cell_type" : " code" ,
577- "execution_count" : 0 ,
564+ "execution_count" : null ,
578565 "metadata" : {
579566 "colab_type" : " code"
580567 },
633620 },
634621 {
635622 "cell_type" : " code" ,
636- "execution_count" : 0 ,
623+ "execution_count" : null ,
637624 "metadata" : {
638625 "colab_type" : " code"
639626 },
691678 },
692679 "nbformat" : 4 ,
693680 "nbformat_minor" : 0
694- }
681+ }