
Commit e3cdc95

Add tokenizer helper to convert tokens to ids (#75)
And from ids to tokens.
1 parent 2e9ea8d commit e3cdc95

File tree

keras_nlp/tokenizers/word_piece_tokenizer.py
keras_nlp/tokenizers/word_piece_tokenizer_test.py

2 files changed: +15 −0

keras_nlp/tokenizers/word_piece_tokenizer.py (11 additions, 0 deletions)

@@ -225,6 +225,17 @@ def vocabulary_size(self) -> int:
         """Get the size of the tokenizer vocabulary."""
         return len(self._vocab)
 
+    def id_to_token(self, id: int) -> str:
+        """Convert an integer id to a string token."""
+        return self._vocab[id]
+
+    def token_to_id(self, token: str) -> int:
+        """Convert a string token to an integer id."""
+        # This will be slow, but keep memory usage down compared to building a
+        # dict. Assuming the main use case is looking up a few special tokens
+        # early in the vocab, this should be fine.
+        return self._vocab.index(token)
+
     def get_config(self) -> Dict[str, Any]:
         config = super().get_config()
         config.update(
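As the inline comment notes, `token_to_id` uses `list.index`, an O(n) scan, to avoid the memory cost of keeping a reverse dict alongside the vocabulary list. A minimal usage sketch of the two new helpers follows (the `vocabulary` constructor argument and import path are assumed from the surrounding library; the caller-side reverse map is an illustration, not part of this commit):

    from keras_nlp.tokenizers import WordPieceTokenizer

    vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"]
    tokenizer = WordPieceTokenizer(vocabulary=vocab)

    tokenizer.id_to_token(6)      # "fox" -- constant-time list indexing
    tokenizer.token_to_id("fox")  # 6 -- linear scan via list.index

    # Caller-side option (not part of this commit): precompute a reverse
    # dict when many token -> id lookups are needed, trading memory for speed.
    reverse = {t: i for i, t in enumerate(tokenizer.get_vocabulary())}
    assert reverse["fox"] == tokenizer.token_to_id("fox")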

keras_nlp/tokenizers/word_piece_tokenizer_test.py (4 additions, 0 deletions)

@@ -66,6 +66,10 @@ def test_accessors(self):
             tokenizer.get_vocabulary(),
             ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"],
         )
+        self.assertEqual(tokenizer.id_to_token(0), "[UNK]")
+        self.assertEqual(tokenizer.id_to_token(6), "fox")
+        self.assertEqual(tokenizer.token_to_id("[UNK]"), 0)
+        self.assertEqual(tokenizer.token_to_id("fox"), 6)
 
     def test_special_tokens(self):
         input_data = ["quick brown whale"]
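To run just the updated test locally, a standard pytest invocation should work (assuming the repo's usual pytest-based workflow; `-k` filters to the test method touched here):

    pytest keras_nlp/tokenizers/word_piece_tokenizer_test.py -k test_accessors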
