     This function returns necessary arguments to call `flash_attn_varlen_func`.
     All three query, key, value states will be flattened.
-    Cummulative lengths of each examples in the batch will be extracted from position_ids.
+    Cumulative lengths of each examples in the batch will be extracted from position_ids.
 
-    NOTE: ideally cummulative lengths should be prepared at the data collator stage
+    NOTE: ideally cumulative lengths should be prepared at the data collator stage
 
     Arguments:
         query (`torch.Tensor`):
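As context for the hunk above, here is a minimal sketch of how cumulative sequence lengths can be derived from `position_ids` when several examples are packed into one row and positions restart at 0 at each example boundary. The helper name is hypothetical and this is not the library's implementation, just an illustration of the idea the docstring describes:

```python
import torch

def cu_seqlens_from_position_ids(position_ids: torch.Tensor) -> torch.Tensor:
    # Packed position_ids restart at 0 at every example boundary,
    # e.g. [0, 1, 2, 0, 1, 0, 1, 2, 3] for three packed examples.
    position_ids = position_ids.flatten()
    # Indices where a new example starts (position resets to 0).
    starts = torch.nonzero(position_ids == 0, as_tuple=False).flatten()
    # cu_seqlens layout expected by flash_attn_varlen_func:
    # [0, len_0, len_0 + len_1, ..., total_tokens]
    total = torch.tensor([position_ids.numel()], device=position_ids.device)
    return torch.cat([starts, total]).to(torch.int32)

# cu_seqlens_from_position_ids(torch.tensor([0, 1, 2, 0, 1, 0, 1, 2, 3]))
# -> tensor([0, 3, 5, 9], dtype=torch.int32)
```

Recomputing these lengths on every forward pass is extra work, which is why the NOTE suggests preparing them at the data collator stage instead.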
@@ -268,7 +268,7 @@ def _flash_attention_forward(
         softmax_scale (`float`, *optional*):
             The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         use_top_left_mask (`bool`, defaults to `False`):
-            flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference.
+            flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference.
         softcap (`float`, *optional*):
             Softcap for the attention logits, used e.g. in gemma2.
         deterministic (`bool`, *optional*):
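To make the `use_top_left_mask` note concrete, below is a small illustration (not taken from the diff) of the two causal-mask alignments for the case where the query is shorter than the key/value sequence, e.g. decoding new tokens against a cache:

```python
import torch

q_len, kv_len = 2, 5  # e.g. 2 new query tokens attending over 5 cached keys

i = torch.arange(q_len).unsqueeze(-1)  # query rows
j = torch.arange(kv_len)               # key columns

# Bottom-right alignment (flash_attn >= 2.1 default): the last query row lines
# up with the last key column, so every new token can see the entire past.
bottom_right = j <= (kv_len - q_len) + i
# tensor([[1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])

# Top-left alignment (flash_attn < 2.1): query row i only sees keys 0..i,
# which drops most of the cache when kv_len > q_len.
top_left = j <= i
# tensor([[1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0]])
```

In practice, handling the older behaviour typically amounts to disabling the causal flag for the single-query decoding step when flash_attn produces top-left aligned masks, so that the lone query token can still attend to the whole cache.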
@@ -374,9 +374,9 @@ class FlashAttentionKwargs(TypedDict, total=False):
         3. SDPA implementation, if available and supported by the model type. (`LlamaSdpaAttention` for example)
         4. The default model's implementation otherwise (`LlamaAttention` for example) .
         """
-        # Here we use config._attn_implementation_internal to check whether the attention implementation was explicitely set by the user.
+        # Here we use config._attn_implementation_internal to check whether the attention implementation was explicitly set by the user.
         # The property `PretrainedConfig._attn_implementation` is never `None`, for backward compatibility (always fall back on "eager").
         # The `hasattr` here is used as some Transformers tests for some reason do not call PretrainedConfig __init__ (e.g. test_no_super_init_config_and_model)
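For orientation, a rough sketch of the fallback order described in the docstring and comments above; the function name and the `sdpa_supported` flag are placeholders rather than the actual transformers API, and the real logic covers more cases (including flash attention 2) than shown here:

```python
def resolve_attn_implementation(config, sdpa_supported: bool) -> str:
    # An implementation explicitly set by the user always wins; this is what the
    # config._attn_implementation_internal check above is for. `getattr` mirrors
    # the `hasattr` guard for configs that never ran PretrainedConfig.__init__.
    explicit = getattr(config, "_attn_implementation_internal", None)
    if explicit is not None:
        return explicit
    # 3. SDPA implementation, if available and supported by the model type.
    if sdpa_supported:
        return "sdpa"
    # 4. The model's default (eager) implementation otherwise.
    return "eager"
```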