Skip to content

Commit 440ad23

Browse files
committed
Fix tokenization padding/truncation logic
1 parent 5e1171c commit 440ad23

File tree

1 file changed

+13
-5
lines changed

1 file changed

+13
-5
lines changed

src/tokenizers.js

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2787,13 +2787,21 @@ export class PreTrainedTokenizer extends Callable {
27872787
// For single input, we just wrap in an array, and then unwrap later.
27882788
encodedTokens = [this._encode_plus(text, { text_pair, add_special_tokens, return_token_type_ids })];
27892789
}
2790-
// At this point, tokens is batched: [batch_size, tokens]
2791-
// However, array may be jagged. So, we pad to max_length
2792-
2793-
if (truncation && max_length === null) {
2790+
// At this point, `encodedTokens` is batched, of shape [batch_size, tokens].
2791+
// However, the array may be jagged. So, we may need to pad to max_length.
2792+
if (max_length === null) {
27942793
max_length = this.model_max_length;
27952794
} else if (max_length && truncation === null) {
2796-
console.warn(`Truncation was not explicitly activated but \`max_length\` is provided a specific value, please use \`truncation=true\` to explicitly truncate examples to max length.`)
2795+
if (padding === true) {
2796+
console.warn(
2797+
"`max_length` is ignored when `padding: true` and there is no truncation strategy. " +
2798+
"To pad to max length, use `padding: 'max_length'`."
2799+
)
2800+
max_length = this.model_max_length;
2801+
} else if (padding === false) {
2802+
console.warn("Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation: true` to explicitly truncate examples to max length.");
2803+
truncation = true;
2804+
}
27972805
}
27982806

27992807
// padding: 'max_length' doesn't require any additional calculation

0 commit comments

Comments
 (0)