huggingface · xenova · May 13, 2025 · May 10, 2025 · May 10, 2025
diff --git a/src/tokenizers.js b/src/tokenizers.js
@@ -366,15 +366,19 @@ export class TokenizerModel extends Callable {
                 return new BPE(config);
 
             default:
-                // Some older tokenizers, like `google-t5/t5-small` and `distilbert/distilbert-base-uncased`, do not have a `type` field.
+                // Some older tokenizers, like `google-t5/t5-small`, `openai-community/gpt2`, and `distilbert/distilbert-base-uncased`, do not have a `type` field.
                 // In this case, we can infer the tokenizer type based on the structure of the `vocab` field and other properties.
                 if (config.vocab) {
                     if (Array.isArray(config.vocab)) {
                         // config.vocab is of type `[string, number][]`
                         // @ts-ignore
                         return new Unigram(config, ...args);
-                    } else if (typeof config.vocab === 'object' && config.continuing_subword_prefix && config.unk_token) {
-                        return new WordPieceTokenizer(config);
+                    } else if (Object.hasOwn(config, 'continuing_subword_prefix') && Object.hasOwn(config, 'unk_token')) {
+                        if (Object.hasOwn(config, 'merges')) {
+                            return new BPE(config);
+                        } else {
+                            return new WordPieceTokenizer(config);
+                        }
                     } else {
                         // @ts-ignore
                         return new LegacyTokenizerModel(config, ...args);

diff --git a/tests/models/gpt2/test_tokenization_gpt2.js b/tests/models/gpt2/test_tokenization_gpt2.js
@@ -460,3 +460,6 @@ export const TEST_CONFIG = {
     },
   },
 };
+
+// Test that the tokenizer model type is inferred when the config has no `type` field (`type: "BPE"` is missing for `openai-community/gpt2`)
+TEST_CONFIG["openai-community/gpt2"] = TEST_CONFIG["Xenova/gpt2"];
-Original file line number
+Diff line change
@@ Expand Up / @@ -460,3 +460,6 @@ export const TEST_CONFIG = { @@
         },
       },
     };
+    // Test that tokenizer type can be inferred (`type: "BPE"` is missing)
+    TEST_CONFIG["openai-community/gpt2"] = TEST_CONFIG["Xenova/gpt2"];