
Commit fca13e8

adhadse and mattdangerw authored
Fixed no oov token error in vocab for WordPieceTokenizer (#136)
* Fixed no oov token error in vocab for WordPieceTokenizer
* Raise no oov_token error during explicit checking for WordPieceTokenizer
* Edits
* Fix

Co-authored-by: Matt Watson <[email protected]>
1 parent beb3f6a commit fca13e8

File tree

2 files changed: +33 −0 lines changed


keras_nlp/tokenizers/word_piece_tokenizer.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -198,6 +198,8 @@ def __init__(
                 "Vocabulary must be an file path or list of terms. "
                 f"Received: vocabulary={vocabulary}"
             )
+        if oov_token is None:
+            raise ValueError("`oov_token` cannot be None.")
 
         self.sequence_length = sequence_length
         self.lowercase = lowercase
@@ -207,6 +209,15 @@ def __init__(
         self.suffix_indicator = suffix_indicator
         self.oov_token = oov_token
 
+        if oov_token not in self.vocabulary:
+            raise RuntimeError(
+                f'Cannot find `oov_token="{self.oov_token}"` in the '
+                "vocabulary.\n"
+                "You can either update the vocabulary to include "
+                f'`"{self.oov_token}"`, or pass a different value for '
+                "the `oov_token` argument when creating the tokenizer."
+            )
+
         self._fast_word_piece = tf_text.FastWordpieceTokenizer(
             vocab=self.vocabulary,
             token_out_type=self.compute_dtype,
```
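To illustrate the new checks from the caller's side, here is a minimal sketch; it assumes the tokenizer's default `oov_token` is `"[UNK]"` and that `WordPieceTokenizer` is importable from `keras_nlp.tokenizers`:

```python
from keras_nlp.tokenizers import WordPieceTokenizer

# A vocabulary that is missing the default "[UNK]" oov token: the
# constructor now fails fast with a RuntimeError instead of building
# a tokenizer that cannot represent out-of-vocabulary words.
vocab = ["qu", "@@ick", "br", "@@OWN", "fox"]
try:
    WordPieceTokenizer(vocabulary=vocab)
except RuntimeError as err:
    print(err)  # Cannot find `oov_token="[UNK]"` in the vocabulary. ...

# Passing oov_token=None is now rejected explicitly with a ValueError.
try:
    WordPieceTokenizer(vocabulary=vocab, oov_token=None)
except ValueError as err:
    print(err)  # `oov_token` cannot be None.
```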

keras_nlp/tokenizers/word_piece_tokenizer_test.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -223,3 +223,25 @@ def test_saving(self):
             model(input_data),
             restored_model(input_data),
         )
+
+    def test_no_oov_token_in_vocabulary(self):
+        vocab_data = ["qu", "@@ick", "br", "@@OWN", "fox"]
+        with self.assertRaises(RuntimeError):
+            WordPieceTokenizer(
+                vocabulary=vocab_data,
+            )
+
+        vocab_data = ["@UNK@", "qu", "@@ick", "br", "@@OWN", "fox"]
+        with self.assertRaises(RuntimeError):
+            WordPieceTokenizer(
+                vocabulary=vocab_data,
+            )
+
+        vocab_data = ["UNK", "qu", "@@ick", "br", "@@OWN", "fox"]
+        with self.assertRaises(RuntimeError):
+            WordPieceTokenizer(
+                vocabulary=vocab_data,
+            )
+
+        with self.assertRaises(ValueError):
+            WordPieceTokenizer(vocabulary=vocab_data, oov_token=None)
```
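For contrast, a vocabulary that does contain the oov token passes both checks. Another minimal sketch, again assuming the default `oov_token="[UNK]"`:

```python
from keras_nlp.tokenizers import WordPieceTokenizer

# With "[UNK]" present in the vocabulary, construction succeeds and
# unseen words map to the oov token's id instead of raising.
vocab = ["[UNK]", "the", "quick", "brown", "fox"]
tokenizer = WordPieceTokenizer(vocabulary=vocab)
print(tokenizer("the quick brown dog"))  # "dog" maps to the "[UNK]" id
```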
