Merged
252 changes: 126 additions & 126 deletions README.md

Large diffs are not rendered by default.

252 changes: 126 additions & 126 deletions docs/snippets/6_supported-models.snippet

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions src/generation/configuration_utils.js
@@ -77,7 +77,7 @@ export class GenerationConfig {

/**
* Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
- * See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+ * See [this paper](https://huggingface.co/papers/1610.02424) for more details.
* @type {number}
* @default 1
*/
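To make the grouping arithmetic concrete (an illustration, not part of this diff): with `num_beams: 6` and `num_beam_groups: 3`, the search runs 3 groups of 2 beams each, and a diversity penalty discourages a group from choosing tokens an earlier group already picked. A hedged config sketch, with all values chosen for illustration:

```js
// Illustrative generation options only — the values (and the companion
// diversity_penalty setting) are assumptions, not taken from this PR.
const options = {
    num_beams: 6,           // total beams across all groups
    num_beam_groups: 3,     // 6 / 3 = 2 beams per group
    diversity_penalty: 1.0, // assumed setting that penalizes cross-group repeats
};
```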
@@ -122,7 +122,7 @@ export class GenerationConfig {
/**
* Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated.
* If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to `typical_p` or higher are kept for generation.
- * See [this paper](https://arxiv.org/pdf/2202.00666.pdf) for more details.
+ * See [this paper](https://huggingface.co/papers/2202.00666) for more details.
* @type {number}
* @default 1.0
*/
@@ -131,7 +131,7 @@ export class GenerationConfig {
/**
* If set to float strictly between 0 and 1, only tokens with a conditional probability greater than `epsilon_cutoff` will be sampled.
* In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model.
- * See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details.
+ * See [Truncation Sampling as Language Model Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
* @type {number}
* @default 0.0
*/
@@ -141,7 +141,7 @@ export class GenerationConfig {
* Eta sampling is a hybrid of locally typical sampling and epsilon sampling.
* If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`.
* The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
- * See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details.
+ * See [Truncation Sampling as Language Model Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
* @type {number}
* @default 0.0
*/
@@ -157,7 +157,7 @@ export class GenerationConfig {

/**
* The parameter for repetition penalty. 1.0 means no penalty.
- * See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+ * See [this paper](https://huggingface.co/papers/1909.05858) for more details.
* @type {number}
* @default 1.0
*/
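Since the epsilon- and eta-cutoff docstrings above describe a concrete truncation rule, a minimal plain-JavaScript sketch may help. This illustrates the rule as stated (a token survives if its probability clears the smaller of `eta_cutoff` and `sqrt(eta_cutoff) * exp(-entropy)`); it is not the library's actual sampler:

```js
// Sketch of eta-cutoff truncation, assuming `logits` is a plain array of numbers.
function etaCutoffKeepMask(logits, eta_cutoff) {
    const max = Math.max(...logits);
    const exps = logits.map(l => Math.exp(l - max));          // numerically stable softmax
    const sum = exps.reduce((a, b) => a + b, 0);
    const probs = exps.map(e => e / sum);
    const entropy = -probs.reduce((a, p) => a + (p > 0 ? p * Math.log(p) : 0), 0);
    // A token is kept if its probability exceeds either bound, i.e. the smaller one.
    const threshold = Math.min(eta_cutoff, Math.sqrt(eta_cutoff) * Math.exp(-entropy));
    return probs.map(p => p >= threshold);                    // true = kept for sampling
}
```

Setting `eta_cutoff` to 0 keeps every token, matching the documented default of 0.0.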
4 changes: 2 additions & 2 deletions src/generation/logits_process.js
@@ -410,7 +410,7 @@ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor {
* This penalty is applied at most once per token. Note that, for decoder-only models like most LLMs,
* the considered tokens include the prompt.
*
- * In the original [paper](https://arxiv.org/pdf/1909.05858.pdf), the authors suggest the use of a
+ * In the original [paper](https://huggingface.co/papers/1909.05858), the authors suggest the use of a
* penalty of around 1.2 to achieve a good balance between truthful generation and lack of repetition.
* To penalize and reduce repetition, use `penalty` values above 1.0, where a higher value penalizes
* more strongly. To reward and encourage repetition, use `penalty` values between 0.0 and 1.0, where
@@ -580,7 +580,7 @@ export class NoBadWordsLogitsProcessor extends LogitsProcessor {
* correspond to the unconditional logits (predicted from an empty or 'null' prompt). The processor computes a
* weighted average across the conditional and unconditional logits, parameterised by the `guidance_scale`.
*
- * See [the paper](https://arxiv.org/abs/2306.05284) for more information.
+ * See [the paper](https://huggingface.co/papers/2306.05284) for more information.
*/
export class ClassifierFreeGuidanceLogitsProcessor extends LogitsProcessor {

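Both processors touched in this file have simple numeric cores, so two hedged sketches may clarify them. These are plain-JavaScript illustrations of the rules described in the docstrings, not the processors' actual code.

Repetition penalty, with the roughly 1.2 value the linked paper's authors suggest:

```js
// Sketch: make already-seen tokens less likely. `logits` is a plain number array.
function applyRepetitionPenalty(logits, seenTokenIds, penalty = 1.2) {
    for (const id of new Set(seenTokenIds)) {
        // Dividing positive logits (and multiplying negative ones) lowers the score either way.
        logits[id] = logits[id] > 0 ? logits[id] / penalty : logits[id] * penalty;
    }
    return logits;
}
```

Classifier-free guidance as a weighted average of conditional and unconditional logits; `guidance_scale = 1` recovers the conditional logits unchanged:

```js
// Sketch: blend conditional and unconditional logits, assuming equal-length arrays.
function applyGuidance(condLogits, uncondLogits, guidance_scale) {
    return condLogits.map((c, i) => uncondLogits[i] + guidance_scale * (c - uncondLogits[i]));
}
```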
2 changes: 1 addition & 1 deletion src/models.js
@@ -7053,7 +7053,7 @@ export class DecisionTransformerPreTrainedModel extends PreTrainedModel { }

/**
* The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL setting.
- * Refer to the paper for more details: https://arxiv.org/abs/2106.01345
+ * Refer to the paper for more details: https://huggingface.co/papers/2106.01345
*/
export class DecisionTransformerModel extends DecisionTransformerPreTrainedModel { }

2 changes: 1 addition & 1 deletion src/pipelines.js
@@ -1912,7 +1912,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
for (const aud of preparedAudios) {
const inputs = await this.processor(aud);

- // According to the [paper](https://arxiv.org/pdf/2410.15608):
+ // According to the [paper](https://huggingface.co/papers/2410.15608):
// "We use greedy decoding, with a heuristic limit of 6 output tokens
// per second of audio to avoid repeated output sequences."
const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
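As a worked example of that heuristic (not part of the diff): at a 16 000 Hz sampling rate, a 10-second clip has an `aud.length` of 160 000 samples, so `max_new_tokens = Math.floor(160000 / 16000) * 6 = 60`.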
4 changes: 2 additions & 2 deletions src/tokenizers.js
@@ -3519,7 +3519,7 @@ function _build_translation_inputs(self, raw_inputs, tokenizer_options, generate
* between any pair of 200+ languages — including low-resource languages like Asturian,
* Luganda, Urdu and more. It aims to help people communicate with anyone, anywhere,
* regardless of their language preferences. For more information, check out their
- * [paper](https://arxiv.org/abs/2207.04672).
+ * [paper](https://huggingface.co/papers/2207.04672).
*
* For a list of supported languages (along with their language codes),
* @see {@link https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200}
@@ -3550,7 +3550,7 @@ export class NllbTokenizer extends PreTrainedTokenizer {
* The M2M100Tokenizer class is used to tokenize text for M2M100 ("Many-to-Many") models.
*
* M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many
- * multilingual translation. It was introduced in this [paper](https://arxiv.org/abs/2010.11125)
+ * multilingual translation. It was introduced in this [paper](https://huggingface.co/papers/2010.11125)
* and first released in [this](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100) repository.
*
* For a list of supported languages (along with their language codes),
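Since both tokenizer docstrings above describe translation models, a hedged end-to-end usage sketch may help. The model id and language codes are illustrative assumptions (the codes follow the FLORES-200 list linked in the NLLB docstring), not something this diff prescribes:

```js
// Hedged usage sketch — model id and language codes are illustrative assumptions.
import { pipeline } from '@huggingface/transformers';

const translator = await pipeline('translation', 'Xenova/nllb-200-distilled-600M');
const output = await translator('Hello, world!', {
    src_lang: 'eng_Latn', // English (Latin script)
    tgt_lang: 'fra_Latn', // French (Latin script)
});
console.log(output); // e.g. [{ translation_text: 'Bonjour, le monde !' }]
```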