|
17 | 17 |
|
@keras_nlp_export("keras_nlp.models.PaliGemmaTokenizer")
class PaliGemmaTokenizer(GemmaTokenizer):
    """SentencePiece-based tokenizer layer for PaliGemma models.

    Converts raw strings into sequences of integer token ids. The class
    builds on `keras_nlp.tokenizers.SentencePieceTokenizer`; in contrast to
    that underlying tokenizer, it checks for every special token required by
    PaliGemma models and offers a `from_preset()` method that automatically
    downloads the vocabulary matching a PaliGemma preset.

    The output type depends on the rank of the input:

    - A batch of strings (rank > 0) yields a `tf.RaggedTensor` whose last
      dimension is ragged.
    - A scalar string (rank == 0) yields a dense `tf.Tensor` with static
      shape `[None]`.

    Args:
        proto: Either a `string` path to a SentencePiece proto file, or a
            `bytes` object holding a serialized SentencePiece proto. The
            format is described in the
            [SentencePiece repository](https://github.com/google/sentencepiece).

    Examples:

    ```python
    # Tokenize a single (unbatched) string.
    tokenizer = keras_nlp.models.PaliGemmaTokenizer.from_preset(
        "pali_gemma_3b_224"
    )
    tokenizer("The quick brown fox jumped.")

    # Tokenize a batch of strings.
    tokenizer(["The quick brown fox jumped.", "The fox slept."])

    # Round-trip: map token ids back to text.
    tokenizer.detokenize(tokenizer("The quick brown fox jumped."))

    # Train a tiny custom vocabulary in memory and tokenize with it.
    bytes_io = io.BytesIO()
    ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])
    sentencepiece.SentencePieceTrainer.train(
        sentence_iterator=ds.as_numpy_iterator(),
        model_writer=bytes_io,
        vocab_size=8,
        model_type="WORD",
        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3,
        pad_piece="<pad>",
        bos_piece="<bos>",
        eos_piece="<eos>",
        unk_piece="<unk>",
    )
    tokenizer = keras_nlp.models.PaliGemmaTokenizer(
        proto=bytes_io.getvalue(),
    )
    tokenizer("The quick brown fox jumped.")
    ```
    """
0 commit comments