@@ -187,8 +187,7 @@ class BytePairTokenizer(tokenizer.Tokenizer):
187187
188188 Examples:
189189
190- Use in-memory vocabulary and merge list.
191-
190+ Tokenize
192191 >>> vocab = {"butter": 1, "fly": 2}
193192 >>> merge = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
194193 >>> tokenizer = keras_nlp.tokenizers.BytePairTokenizer(vocab, merge)
@@ -205,31 +204,13 @@ class BytePairTokenizer(tokenizer.Tokenizer):
205204 array([[1, 2],
206205 [1, 0]], dtype=int32)>
207206
208- Use hosted vocabluary and merge list.
209-
210- ```python
211- vocab_path = tf.keras.utils.get_file(
212- "vocab.json",
213- "https://storage.googleapis.com/keras-nlp/models/roberta_base/vocab.json",
214- )
215- merge_path = tf.keras.utils.get_file(
216- "merges.txt",
217- "https://storage.googleapis.com/keras-nlp/models/roberta_base/merges.txt",
218- )
219- tokenizer = BytePairTokenizer(
220- vocabulary=vocab_path, merges=merge_path
221- )
222- tokenizer("Butterfly is not flying butter!")
223- ```
224-
225207 Detokenize
226208 >>> vocab = {"butter": 1, "fly": 2}
227209 >>> merge = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
228210 >>> tokenizer = keras_nlp.tokenizers.BytePairTokenizer(vocab, merge)
229211 >>> tokenizer.detokenize([[1, 2]])
230212 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'butterfly'],
231213 dtype=object)>
232-
233214 """
234215
235216 def __init__ (
0 commit comments