diff --git a/src/tokenizers.js b/src/tokenizers.js
index a8d64407d..899bbe65e 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -366,15 +366,19 @@ export class TokenizerModel extends Callable {
                 return new BPE(config);
 
             default:
-                // Some older tokenizers, like `google-t5/t5-small` and `distilbert/distilbert-base-uncased`, do not have a `type` field.
+                // Some older tokenizers, like `google-t5/t5-small`, `openai-community/gpt2`, and `distilbert/distilbert-base-uncased`, do not have a `type` field.
                 // In this case, we can infer the tokenizer type based on the structure of the `vocab` field and other properties.
                 if (config.vocab) {
                     if (Array.isArray(config.vocab)) {
                         // config.vocab is of type `[string, number][]`
                         // @ts-ignore
                         return new Unigram(config, ...args);
-                    } else if (typeof config.vocab === 'object' && config.continuing_subword_prefix && config.unk_token) {
-                        return new WordPieceTokenizer(config);
+                    } else if (Object.hasOwn(config, 'continuing_subword_prefix') && Object.hasOwn(config, 'unk_token')) {
+                        if (Object.hasOwn(config, 'merges')) {
+                            return new BPE(config);
+                        } else {
+                            return new WordPieceTokenizer(config);
+                        }
                     } else {
                         // @ts-ignore
                         return new LegacyTokenizerModel(config, ...args);
diff --git a/tests/models/gpt2/test_tokenization_gpt2.js b/tests/models/gpt2/test_tokenization_gpt2.js
index c573e7b5c..3f4b27b64 100644
--- a/tests/models/gpt2/test_tokenization_gpt2.js
+++ b/tests/models/gpt2/test_tokenization_gpt2.js
@@ -460,3 +460,6 @@ export const TEST_CONFIG = {
         },
     },
 };
+
+// Test that tokenizer type can be inferred (`type: "BPE"` is missing)
+TEST_CONFIG["openai-community/gpt2"] = TEST_CONFIG["Xenova/gpt2"];