Commit ca9765c

Merge branch 'main' into ort-improvements

2 parents: 2972828 + 4362237

File tree

15 files changed: +294 -275 lines

README.md

Lines changed: 128 additions & 128 deletions
Large diffs are not rendered by default.

docs/snippets/2_installation.snippet

Lines changed: 1 addition & 1 deletion
````diff
@@ -7,6 +7,6 @@ npm i @huggingface/transformers
 Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
 ```html
 <script type="module">
-    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.1';
+    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.2';
 </script>
 ```
````

docs/snippets/4_custom-usage.snippet

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 
 
-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.1/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.2/dist/), which should work out-of-the-box. You can customize this as follows:
 
 ### Settings
 
```

docs/snippets/6_supported-models.snippet

Lines changed: 126 additions & 126 deletions
Large diffs are not rendered by default.

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default.

package.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.5.1",
+  "version": "3.5.2",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",
```

src/env.js

Lines changed: 1 addition & 1 deletion
```diff
@@ -26,7 +26,7 @@ import fs from 'fs';
 import path from 'path';
 import url from 'url';
 
-const VERSION = '3.5.1';
+const VERSION = '3.5.2';
 
 // Check if various APIs are available (depends on environment)
 const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
```

src/generation/configuration_utils.js

Lines changed: 5 additions & 5 deletions
```diff
@@ -77,7 +77,7 @@ export class GenerationConfig {
 
     /**
      * Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
-     * See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+     * See [this paper](https://huggingface.co/papers/1610.02424) for more details.
      * @type {number}
      * @default 1
      */
@@ -122,7 +122,7 @@ export class GenerationConfig {
     /**
      * Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated.
      * If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to `typical_p` or higher are kept for generation.
-     * See [this paper](https://arxiv.org/pdf/2202.00666.pdf) for more details.
+     * See [this paper](https://huggingface.co/papers/2202.00666) for more details.
      * @type {number}
      * @default 1.0
      */
@@ -131,7 +131,7 @@ export class GenerationConfig {
     /**
      * If set to float strictly between 0 and 1, only tokens with a conditional probability greater than `epsilon_cutoff` will be sampled.
      * In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model.
-     * See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details.
+     * See [Truncation Sampling as Language Model Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
      * @type {number}
      * @default 0.0
      */
@@ -141,7 +141,7 @@ export class GenerationConfig {
      * Eta sampling is a hybrid of locally typical sampling and epsilon sampling.
      * If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`.
      * The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
-     * See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details.
+     * See [Truncation Sampling as Language Model Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
      * @type {number}
      * @default 0.0
      */
@@ -157,7 +157,7 @@ export class GenerationConfig {
 
     /**
      * The parameter for repetition penalty. 1.0 means no penalty.
-     * See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+     * See [this paper](https://huggingface.co/papers/1909.05858) for more details.
      * @type {number}
      * @default 1.0
      */
```
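The eta-sampling rule quoted in the `eta_cutoff` docstring above can be sketched numerically: a token survives if its probability exceeds either `eta_cutoff` itself or the entropy-scaled term, i.e. exceeds the smaller of the two. A minimal illustration with hypothetical helper names (this is not the library's implementation):

```javascript
// Softmax over an array of logits (numerically stabilized).
function softmax(logits) {
    const max = Math.max(...logits);
    const exps = logits.map(x => Math.exp(x - max));
    const sum = exps.reduce((a, b) => a + b, 0);
    return exps.map(e => e / sum);
}

// Shannon entropy (in nats) of a probability distribution.
function entropy(probs) {
    return -probs.reduce((h, p) => h + (p > 0 ? p * Math.log(p) : 0), 0);
}

// Indices of tokens kept by eta sampling: the effective cutoff is the
// smaller of `etaCutoff` and sqrt(etaCutoff) * exp(-entropy), so a token
// is kept if its probability exceeds either threshold.
function etaSampleIndices(logits, etaCutoff) {
    const probs = softmax(logits);
    const cutoff = Math.min(etaCutoff, Math.sqrt(etaCutoff) * Math.exp(-entropy(probs)));
    return probs.map((p, i) => (p > cutoff ? i : -1)).filter(i => i >= 0);
}
```

With a sharply peaked distribution the entropy term is near `sqrt(etaCutoff)`, so the plain `etaCutoff` dominates and only high-probability tokens remain.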

src/generation/logits_process.js

Lines changed: 2 additions & 2 deletions
```diff
@@ -410,7 +410,7 @@ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor {
  * This penalty is applied at most once per token. Note that, for decoder-only models like most LLMs,
  * the considered tokens include the prompt.
  *
- * In the original [paper](https://arxiv.org/pdf/1909.05858.pdf), the authors suggest the use of a
+ * In the original [paper](https://huggingface.co/papers/1909.05858), the authors suggest the use of a
  * penalty of around 1.2 to achieve a good balance between truthful generation and lack of repetition.
  * To penalize and reduce repetition, use `penalty` values above 1.0, where a higher value penalizes
  * more strongly. To reward and encourage repetition, use `penalty` values between 0.0 and 1.0, where
@@ -580,7 +580,7 @@ export class NoBadWordsLogitsProcessor extends LogitsProcessor {
  * correspond to the unconditional logits (predicted from an empty or 'null' prompt). The processor computes a
  * weighted average across the conditional and unconditional logits, parameterised by the `guidance_scale`.
  *
- * See [the paper](https://arxiv.org/abs/2306.05284) for more information.
+ * See [the paper](https://huggingface.co/papers/2306.05284) for more information.
  */
 export class ClassifierFreeGuidanceLogitsProcessor extends LogitsProcessor {
 
```
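The weighted average described in the classifier-free guidance docstring reduces to one line per logit. A minimal sketch, assuming the standard CFG formula `uncond + guidance_scale * (cond - uncond)`; the function name is hypothetical, not the library's API:

```javascript
// Combine conditional and unconditional logits with classifier-free guidance.
// guidanceScale = 1 returns the conditional logits unchanged; values > 1
// push the result further toward the conditioned prediction.
function applyGuidance(condLogits, uncondLogits, guidanceScale) {
    return condLogits.map((c, i) => uncondLogits[i] + guidanceScale * (c - uncondLogits[i]));
}
```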

src/generation/streamers.js

Lines changed: 5 additions & 1 deletion
```diff
@@ -208,7 +208,11 @@ export class WhisperTextStreamer extends TextStreamer {
                     this.on_chunk_start?.(time);
                 }
                 this.waiting_for_timestamp = !this.waiting_for_timestamp; // Toggle
-                value = [[]]; // Skip timestamp
+
+                // NOTE: Timestamp tokens should not be printed. Although, since they
+                // aren't classified as "special tokens", we need to handle them here.
+                this.token_callback_function?.(tokens);
+                return;
             }
         }
         return super.put(value);
```
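The behavioural change in this hunk is that a timestamp token now still fires the token callback instead of being silently swallowed, while continuing to be excluded from the decoded text. A toy illustration of that contract (`TinyTimestampStreamer` and its options are hypothetical, not the real `WhisperTextStreamer` API):

```javascript
// Miniature streamer: timestamp tokens reach the token callback but
// never the text output; ordinary tokens reach both.
class TinyTimestampStreamer {
    constructor({ onText, onToken, isTimestamp }) {
        this.onText = onText;
        this.onToken = onToken;
        this.isTimestamp = isTimestamp;
    }
    put(token) {
        if (this.isTimestamp(token)) {
            // Forward the token for timing consumers, but skip printing.
            this.onToken?.(token);
            return;
        }
        this.onToken?.(token);
        this.onText?.(String(token));
    }
}
```

Before this commit, the equivalent of the timestamp branch dropped the token entirely, so downstream consumers relying on the token callback never saw timestamp tokens.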
