@@ -30,19 +30,25 @@ class ByteTokenizer(tokenizer.Tokenizer):
3030 This tokenizer is a vocabulary-free tokenizer which will tokenize text
3131 as raw bytes from [0, 256).
3232
33+ Tokenizer outputs can either be padded and truncated with a
34+ `sequence_length` argument, or left un-truncated. The exact output will
35+ depend on the rank of the input tensors.
36+
3337 If input is a batch of strings:
3438 By default, the layer will output a `tf.RaggedTensor` where the last
3539 dimension of the output is ragged. If `sequence_length` is set, the layer
3640 will output a dense `tf.Tensor` where all inputs have been padded or
37- truncated to `sequence_length`. The output dtype can be controlled via the
38- `dtype` argument, which should be an integer type
39- (tf.int16, tf.int32, etc.).
41+ truncated to `sequence_length`.
4042
4143 If input is a scalar string:
4244 There are two cases here. If `sequence_length` is set, the output will be
4345 a dense `tf.Tensor` of shape `[sequence_length]`. Otherwise, the output will
4446 be a dense `tf.Tensor` of shape `[None]`.
4547
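A minimal sketch of the two scalar cases (illustrative inputs; the byte values shown are standard ASCII):

>>> keras_nlp.tokenizers.ByteTokenizer()("hi")
<tf.Tensor: shape=(2,), dtype=int32, numpy=array([104, 105], dtype=int32)>
>>> keras_nlp.tokenizers.ByteTokenizer(sequence_length=4)("hi")
<tf.Tensor: shape=(4,), dtype=int32, numpy=array([104, 105, 0, 0], dtype=int32)>
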
48+ The output dtype can be controlled via the
49+ `dtype` argument, which should be an integer type
50+ (tf.int16, tf.int32, etc.).
51+
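A minimal sketch of controlling the output dtype (the `"int16"` choice here is only for illustration):

>>> tokenizer = keras_nlp.tokenizers.ByteTokenizer(dtype="int16")
>>> tokenizer("hello")
<tf.Tensor: shape=(5,), dtype=int16, numpy=array([104, 101, 108, 108, 111], dtype=int16)>
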
4652 Args:
4753 lowercase: boolean. If True, the input text will be converted to
4854 lowercase before tokenization.
@@ -89,22 +95,22 @@ class ByteTokenizer(tokenizer.Tokenizer):
8995 <tf.Tensor: shape=(1, 8), dtype=int32, numpy=
9096 array([[104, 101, 108, 108, 111, 0, 0, 0]], dtype=int32)>
9197
92- Tokenize first, then batch the dataset up.
98+ Tokenize, then batch for ragged outputs.
9399 >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer()
94100 >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
95101 >>> ds = ds.map(tokenizer)
96102 >>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(2))
97103 >>> ds.take(1).get_single_element()
98104 <tf.RaggedTensor [[104, 101, 108, 108, 111], [102, 117, 110]]>
99105
100- Batch the inputs and then tokenize.
106+ Batch, then tokenize for ragged outputs.
101107 >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer()
102108 >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
103109 >>> ds = ds.batch(2).map(tokenizer)
104110 >>> ds.take(1).get_single_element()
105111 <tf.RaggedTensor [[104, 101, 108, 108, 111], [102, 117, 110]]>
106112
107- Tokenize first, then batch the dataset up (`sequence_length` provided).
113+ Tokenize, then batch for dense outputs (`sequence_length` provided).
108114 >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer(sequence_length=5)
109115 >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
110116 >>> ds = ds.map(tokenizer)
@@ -114,7 +120,7 @@ class ByteTokenizer(tokenizer.Tokenizer):
114120 array([[104, 101, 108, 108, 111],
115121 [102, 117, 110, 0, 0]], dtype=int32)>
116122
117- Batch the inputs and then tokenize (`sequence_length` provided).
123+ Batch, then tokenize for dense outputs (`sequence_length` provided).
118124 >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer(sequence_length=5)
119125 >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
120126 >>> ds = ds.batch(2).map(tokenizer)