Skip to content

Commit e5ce378

Browse files
jbischof
authored and mattdangerw committed
initial commit (#440)
1 parent e575e98 commit e5ce378

File tree

1 file changed

+1
-20
lines changed

1 file changed

+1
-20
lines changed

keras_nlp/tokenizers/byte_pair_tokenizer.py

Lines changed: 1 addition & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -187,8 +187,7 @@ class BytePairTokenizer(tokenizer.Tokenizer):
187187
188188
Examples:
189189
190-
Use in-memory vocabulary and merge list.
191-
190+
Tokenize
192191
>>> vocab = {"butter": 1, "fly": 2}
193192
>>> merge = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
194193
>>> tokenizer = keras_nlp.tokenizers.BytePairTokenizer(vocab, merge)
@@ -205,31 +204,13 @@ class BytePairTokenizer(tokenizer.Tokenizer):
205204
array([[1, 2],
206205
[1, 0]], dtype=int32)>
207206
208-
Use hosted vocabulary and merge list.
209-
210-
```python
211-
vocab_path = tf.keras.utils.get_file(
212-
"vocab.json",
213-
"https://storage.googleapis.com/keras-nlp/models/roberta_base/vocab.json",
214-
)
215-
merge_path = tf.keras.utils.get_file(
216-
"merges.txt",
217-
"https://storage.googleapis.com/keras-nlp/models/roberta_base/merges.txt",
218-
)
219-
tokenizer = BytePairTokenizer(
220-
vocabulary=vocab_path, merges=merge_path
221-
)
222-
tokenizer("Butterfly is not flying butter!")
223-
```
224-
225207
Detokenize
226208
>>> vocab = {"butter": 1, "fly": 2}
227209
>>> merge = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
228210
>>> tokenizer = keras_nlp.tokenizers.BytePairTokenizer(vocab, merge)
229211
>>> tokenizer.detokenize([[1, 2]])
230212
<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'butterfly'],
231213
dtype=object)>
232-
233214
"""
234215

235216
def __init__(

0 commit comments

Comments
 (0)