Skip to content

Commit 858babb

Browse files
committed
Add a tokenizer docstring for pali gemma
Otherwise the symbol would have mismatched code examples
1 parent 09183e9 commit 858babb

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed

keras_nlp/src/models/pali_gemma/pali_gemma_tokenizer.py

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,4 +17,63 @@
1717

1818
@keras_nlp_export("keras_nlp.models.PaliGemmaTokenizer")
1919
class PaliGemmaTokenizer(GemmaTokenizer):
20+
"""PaliGemma tokenizer layer based on SentencePiece.
21+
22+
This tokenizer class will tokenize raw strings into integer sequences and
23+
is based on `keras_nlp.tokenizers.SentencePieceTokenizer`. Unlike the
24+
underlying tokenizer, it will check for all special tokens needed by
25+
PaliGemma models and provide a `from_preset()` method to automatically
26+
download a matching vocabulary for a PaliGemma preset.
27+
28+
If input is a batch of strings (rank > 0), the layer will output a
29+
`tf.RaggedTensor` where the last dimension of the output is ragged.
30+
31+
If input is a scalar string (rank == 0), the layer will output a dense
32+
`tf.Tensor` with static shape `[None]`.
33+
34+
Args:
35+
proto: Either a `string` path to a SentencePiece proto file, or a
36+
`bytes` object with a serialized SentencePiece proto. See the
37+
[SentencePiece repository](https://github.com/google/sentencepiece)
38+
for more details on the format.
39+
40+
Examples:
41+
42+
```python
43+
# Unbatched input.
44+
tokenizer = keras_nlp.models.PaliGemmaTokenizer.from_preset(
45+
"pali_gemma_3b_224"
46+
)
47+
tokenizer("The quick brown fox jumped.")
48+
49+
# Batched input.
50+
tokenizer(["The quick brown fox jumped.", "The fox slept."])
51+
52+
# Detokenization.
53+
tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
54+
55+
# Custom vocabulary.
56+
bytes_io = io.BytesIO()
57+
ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])
58+
sentencepiece.SentencePieceTrainer.train(
59+
sentence_iterator=ds.as_numpy_iterator(),
60+
model_writer=bytes_io,
61+
vocab_size=8,
62+
model_type="WORD",
63+
pad_id=0,
64+
bos_id=1,
65+
eos_id=2,
66+
unk_id=3,
67+
pad_piece="<pad>",
68+
bos_piece="<bos>",
69+
eos_piece="<eos>",
70+
unk_piece="<unk>",
71+
)
72+
tokenizer = keras_nlp.models.PaliGemmaTokenizer(
73+
proto=bytes_io.getvalue(),
74+
)
75+
tokenizer("The quick brown fox jumped.")
76+
```
77+
"""
78+
2079
pass

0 commit comments

Comments
 (0)