
Commit 85216e9

chenmoneygithub authored
Compatibility change for KerasNLP 0.4 release (#1169)

* Get keras-io guide compatible with kerasNLP 0.4 release
* run the colab
* rewording
* address comments

Co-authored-by: chenmoneygithub <[email protected]>
1 parent c7f11c7 commit 85216e9

9 files changed: +381 -545 lines changed

examples/nlp/fnet_classification_with_keras_nlp.py

Lines changed: 8 additions & 18 deletions
@@ -2,7 +2,7 @@
 Title: Text Classification using FNet
 Author: [Abheesht Sharma](https://github.com/abheesht17/)
 Date created: 2022/06/01
-Last modified: 2022/06/01
+Last modified: 2022/12/21
 Description: Text Classification on the IMDb Dataset using `keras_nlp.layers.FNetEncoder` layer.
 Accelerator: GPU
 """
@@ -51,12 +51,10 @@
 """

 import keras_nlp
-import random
 import tensorflow as tf
 import os

 from tensorflow import keras
-from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

 keras.utils.set_random_seed(42)

@@ -151,28 +149,20 @@
 training it on a corpus gives us a vocabulary of subwords. A subword tokenizer
 is a compromise between word tokenizers (word tokenizers need very large
 vocabularies for good coverage of input words), and character tokenizers
-(characters don't really encode meaning like words do). Luckily, TensorFlow Text
-makes it very simple to train WordPiece on a corpus as described in
-[this guide](https://www.tensorflow.org/text/guide/subwords_tokenizer).
+(characters don't really encode meaning like words do). Luckily, KerasNLP
+makes it very simple to train WordPiece on a corpus with the
+`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility.

 Note: The official implementation of FNet uses the SentencePiece Tokenizer.
 """


 def train_word_piece(ds, vocab_size, reserved_tokens):
-    bert_vocab_args = dict(
-        # The target vocabulary size
-        vocab_size=vocab_size,
-        # Reserved tokens that must be included in the vocabulary
-        reserved_tokens=reserved_tokens,
-        # Arguments for `text.BertTokenizer`
-        bert_tokenizer_params={"lower_case": True},
-    )
-
-    # Extract text samples (remove the labels).
     word_piece_ds = ds.unbatch().map(lambda x, y: x)
-    vocab = bert_vocab.bert_vocab_from_dataset(
-        word_piece_ds.batch(1000).prefetch(2), **bert_vocab_args
+    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
+        word_piece_ds.batch(1000).prefetch(2),
+        vocabulary_size=vocab_size,
+        reserved_tokens=reserved_tokens,
    )
     return vocab
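For context, a minimal sketch of how the updated `train_word_piece` helper can be exercised after this change. The helper body mirrors the diff above; the toy dataset, `vocab_size=100`, the `reserved_tokens` list, and the `sequence_length=8` tokenizer setting are illustrative assumptions, not values taken from the guide.

import keras_nlp
import tensorflow as tf


def train_word_piece(ds, vocab_size, reserved_tokens):
    # Drop the labels so only raw text reaches the vocabulary trainer.
    word_piece_ds = ds.unbatch().map(lambda x, y: x)
    # KerasNLP 0.4 utility replacing tensorflow_text's bert_vocab_from_dataset.
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )
    return vocab


# Hypothetical toy data standing in for the IMDb splits used in the guide.
toy_ds = tf.data.Dataset.from_tensor_slices(
    (["the movie was great", "the movie was terrible"], [1, 0])
).batch(2)

vocab = train_word_piece(toy_ds, vocab_size=100, reserved_tokens=["[PAD]", "[UNK]"])

# The learned vocabulary plugs straight into KerasNLP's WordPiece tokenizer.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab, lowercase=False, sequence_length=8
)
print(tokenizer("the movie was great"))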

examples/nlp/ipynb/fnet_classification_with_keras_nlp.ipynb

Lines changed: 33 additions & 46 deletions
@@ -10,7 +10,7 @@
 "\n",
 "**Author:** [Abheesht Sharma](https://github.com/abheesht17/)<br>\n",
 "**Date created:** 2022/06/01<br>\n",
-"**Last modified:** 2022/06/01<br>\n",
+"**Last modified:** 2022/12/21<br>\n",
 "**Description:** Text Classification on the IMDb Dataset using `keras_nlp.layers.FNetEncoder` layer."
 ]
 },
@@ -69,19 +69,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
 "outputs": [],
 "source": [
 "import keras_nlp\n",
-"import random\n",
 "import tensorflow as tf\n",
 "import os\n",
 "\n",
 "from tensorflow import keras\n",
-"from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab\n",
 "\n",
 "keras.utils.set_random_seed(42)"
 ]
@@ -97,7 +95,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -125,7 +123,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -147,7 +145,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -172,7 +170,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -193,7 +191,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -227,7 +225,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -249,7 +247,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -258,11 +256,11 @@
 "for text_batch, label_batch in train_ds.take(1):\n",
 "    for i in range(3):\n",
 "        print(text_batch.numpy()[i])\n",
-"        print(label_batch.numpy()[i])\n",
-""
+"        print(label_batch.numpy()[i])\n"
 ]
 },
 {
+"attachments": {},
 "cell_type": "markdown",
 "metadata": {
 "colab_type": "text"
@@ -279,39 +277,30 @@
 "training it on a corpus gives us a vocabulary of subwords. A subword tokenizer\n",
 "is a compromise between word tokenizers (word tokenizers need very large\n",
 "vocabularies for good coverage of input words), and character tokenizers\n",
-"(characters don't really encode meaning like words do). Luckily, TensorFlow Text\n",
-"makes it very simple to train WordPiece on a corpus as described in\n",
-"[this guide](https://www.tensorflow.org/text/guide/subwords_tokenizer).\n",
+"(characters don't really encode meaning like words do). Luckily, KerasNLP\n",
+"makes it very simple to train WordPiece on a corpus with the \n",
+"`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility.\n",
 "\n",
 "Note: The official implementation of FNet uses the SentencePiece Tokenizer."
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
 "outputs": [],
 "source": [
 "\n",
 "def train_word_piece(ds, vocab_size, reserved_tokens):\n",
-"    bert_vocab_args = dict(\n",
-"        # The target vocabulary size\n",
-"        vocab_size=vocab_size,\n",
-"        # Reserved tokens that must be included in the vocabulary\n",
-"        reserved_tokens=reserved_tokens,\n",
-"        # Arguments for `text.BertTokenizer`\n",
-"        bert_tokenizer_params={\"lower_case\": True},\n",
-"    )\n",
-"\n",
-"    # Extract text samples (remove the labels).\n",
 "    word_piece_ds = ds.unbatch().map(lambda x, y: x)\n",
-"    vocab = bert_vocab.bert_vocab_from_dataset(\n",
-"        word_piece_ds.batch(1000).prefetch(2), **bert_vocab_args\n",
+"    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(\n",
+"        word_piece_ds.batch(1000).prefetch(2),\n",
+"        vocabulary_size=vocab_size,\n",
+"        reserved_tokens=reserved_tokens,\n",
 "    )\n",
-"    return vocab\n",
-""
+"    return vocab\n"
 ]
 },
 {
@@ -329,7 +318,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -351,7 +340,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -374,7 +363,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -400,7 +389,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -411,8 +400,7 @@
 "\n",
 "print(\"Sentence: \", input_sentence_ex)\n",
 "print(\"Tokens: \", input_tokens_ex)\n",
-"print(\"Recovered text after detokenizing: \", tokenizer.detokenize(input_tokens_ex))\n",
-""
+"print(\"Recovered text after detokenizing: \", tokenizer.detokenize(input_tokens_ex))\n"
 ]
 },
 {
@@ -429,7 +417,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -476,7 +464,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -517,7 +505,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -547,14 +535,13 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
 "outputs": [],
 "source": [
-"fnet_classifier.evaluate(test_ds, batch_size=BATCH_SIZE)\n",
-""
+"fnet_classifier.evaluate(test_ds, batch_size=BATCH_SIZE)\n"
 ]
 },
 {
@@ -574,7 +561,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -633,7 +620,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 0,
+"execution_count": null,
 "metadata": {
 "colab_type": "code"
 },
@@ -691,4 +678,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 0
-}
+}
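Most of the notebook hunks above are mechanical: unexecuted code cells switch from "execution_count": 0 to "execution_count": null, and stray empty-string entries at the end of "source" lists are dropped. As a rough illustration only (keras-io regenerates these notebooks with its own tooling; the file path and JSON formatting below are assumptions), the normalization amounts to something like:

import json

# Hypothetical local path; the real notebook lives under examples/nlp/ipynb/ in keras-io.
path = "fnet_classification_with_keras_nlp.ipynb"

with open(path) as f:
    nb = json.load(f)

for cell in nb["cells"]:
    if cell.get("cell_type") == "code":
        # Unexecuted notebooks should store null, not 0, as the execution count.
        cell["execution_count"] = None
    # Drop stray empty-string entries from the cell's source list.
    cell["source"] = [line for line in cell.get("source", []) if line != ""]

with open(path, "w") as f:
    json.dump(nb, f, indent=1)
    f.write("\n")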
