
Commit e3cdc95

Add tokenizer helper to convert tokens to ids (#75)
And from ids to tokens.
1 parent 2e9ea8d commit e3cdc95

File tree

keras_nlp/tokenizers/word_piece_tokenizer.py
keras_nlp/tokenizers/word_piece_tokenizer_test.py

2 files changed: +15 −0

keras_nlp/tokenizers/word_piece_tokenizer.py (11 additions, 0 deletions)

@@ -225,6 +225,17 @@ def vocabulary_size(self) -> int:
         """Get the size of the tokenizer vocabulary."""
         return len(self._vocab)
 
+    def id_to_token(self, id: int) -> str:
+        """Convert an integer id to a string token."""
+        return self._vocab[id]
+
+    def token_to_id(self, token: str) -> int:
+        """Convert a string token to an integer id."""
+        # This will be slow, but keep memory usage down compared to building a
+        # dict. Assuming the main use case is looking up a few special tokens
+        # early in the vocab, this should be fine.
+        return self._vocab.index(token)
+
     def get_config(self) -> Dict[str, Any]:
         config = super().get_config()
         config.update(
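As the inline comment notes, `token_to_id` uses `list.index`, an O(n) scan, to avoid the memory cost of keeping a reverse dict alongside the vocabulary list. A minimal usage sketch of the two new helpers follows (the `vocabulary` constructor argument and import path are assumed from the surrounding library; the caller-side reverse map is an illustration, not part of this commit):

    from keras_nlp.tokenizers import WordPieceTokenizer

    vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"]
    tokenizer = WordPieceTokenizer(vocabulary=vocab)

    tokenizer.id_to_token(6)      # "fox" -- constant-time list indexing
    tokenizer.token_to_id("fox")  # 6 -- linear scan via list.index

    # Caller-side option (not part of this commit): precompute a reverse
    # dict when many token -> id lookups are needed, trading memory for speed.
    reverse = {t: i for i, t in enumerate(tokenizer.get_vocabulary())}
    assert reverse["fox"] == tokenizer.token_to_id("fox")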

keras_nlp/tokenizers/word_piece_tokenizer_test.py (4 additions, 0 deletions)

@@ -66,6 +66,10 @@ def test_accessors(self):
             tokenizer.get_vocabulary(),
             ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"],
         )
+        self.assertEqual(tokenizer.id_to_token(0), "[UNK]")
+        self.assertEqual(tokenizer.id_to_token(6), "fox")
+        self.assertEqual(tokenizer.token_to_id("[UNK]"), 0)
+        self.assertEqual(tokenizer.token_to_id("fox"), 6)
 
     def test_special_tokens(self):
         input_data = ["quick brown whale"]
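To run just the updated test locally, a standard pytest invocation should work (assuming the repo's usual pytest-based workflow; `-k` filters to the test method touched here):

    pytest keras_nlp/tokenizers/word_piece_tokenizer_test.py -k test_accessors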
