
Commit 9e84b38

chore: enhance message descriptions in parameters,comments,logs and docstrings (#36554)
* chore: enhance message descriptions in parameters, comments, logs and docstrings
* Update src/transformers/hf_argparser.py
* Update src/transformers/keras_callbacks.py

Co-authored-by: Matt <[email protected]>
1 parent 6966fa1 commit 9e84b38

17 files changed: +55 −55 lines changed

src/transformers/configuration_utils.py

Lines changed: 3 additions & 3 deletions
@@ -191,7 +191,7 @@ class PretrainedConfig(PushToHubMixin):
 v5.
 loss_type (`str`, *optional*):
 The type of loss that the model should use. It should be in `LOSS_MAPPING`'s keys, otherwise the loss will
-be automatically infered from the model architecture.
+be automatically inferred from the model architecture.
 """

 model_type: str = ""
@@ -254,7 +254,7 @@ def __init__(self, **kwargs):
 if num_labels is not None and len(self.id2label) != num_labels:
 logger.warning(
 f"You passed along `num_labels={num_labels}` with an incompatible id to label map: "
-f"{self.id2label}. The number of labels wil be overwritten to {self.num_labels}."
+f"{self.id2label}. The number of labels will be overwritten to {self.num_labels}."
 )
 self.id2label = {int(key): value for key, value in self.id2label.items()}
 # Keys are always strings in JSON so convert ids to int here.
@@ -1094,7 +1094,7 @@ def _get_non_default_generation_parameters(self) -> Dict[str, Any]:
 is_default_in_config = is_default_generation_value = None
 parameter_value = getattr(self_decoder_config, parameter_name)
 # Three cases in which is okay for the model config to hold generation config parameters:
-# 1. The parameter is set to `None`, effectivelly delegating its value to the generation config
+# 1. The parameter is set to `None`, effectively delegating its value to the generation config
 if parameter_value is None:
 continue
 # 2. If we have a default config, then the instance should hold the same generation defaults
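For context, a hedged sketch of how the corrected warning above surfaces in practice: passing `num_labels` together with an `id2label` map of a different size makes `PretrainedConfig` emit the warning and take the label count from `id2label`. The label names are placeholders.

from transformers import PretrainedConfig

# Illustration only: num_labels=3 conflicts with a two-entry id2label map,
# so the "will be overwritten" warning fires and num_labels resolves to 2.
config = PretrainedConfig(
    num_labels=3,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)
print(config.num_labels)  # 2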

src/transformers/convert_slow_tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -1727,5 +1727,5 @@ def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokeni
 raise ValueError(
 f"Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path "
 f"with a SentencePiece tokenizer.model file."
-f"Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}"
+f"Currently available slow->fast converters: {list(SLOW_TO_FAST_CONVERTERS.keys())}"
 )

src/transformers/hf_argparser.py

Lines changed: 1 addition & 1 deletion
@@ -201,7 +201,7 @@ def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field):
 else:
 kwargs["required"] = True
 elif field.type is bool or field.type == Optional[bool]:
-# Copy the currect kwargs to use to instantiate a `no_*` complement argument below.
+# Copy the correct kwargs to use to instantiate a `no_*` complement argument below.
 # We do not initialize it here because the `no_*` alternative must be instantiated after the real argument
 bool_kwargs = copy(kwargs)
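A hedged usage sketch of the `no_*` complement the comment refers to: for a boolean dataclass field with a `True` default, `HfArgumentParser` exposes both the flag and its negation. The field and flag names here are made up for illustration.

from dataclasses import dataclass, field
from transformers import HfArgumentParser

@dataclass
class ExampleArgs:
    use_cache: bool = field(default=True, metadata={"help": "Whether to use the cache."})

parser = HfArgumentParser(ExampleArgs)
# The bool field yields `--use_cache` plus a `--no_use_cache` complement that flips the default.
(args,) = parser.parse_args_into_dataclasses(["--no_use_cache"])
print(args.use_cache)  # False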

src/transformers/image_transforms.py

Lines changed: 1 addition & 1 deletion
@@ -585,7 +585,7 @@ def center_to_corners_format(bboxes_center: TensorType) -> TensorType:

 center format: contains the coordinate for the center of the box and its width, height dimensions
 (center_x, center_y, width, height)
-corners format: contains the coodinates for the top-left and bottom-right corners of the box
+corners format: contains the coordinates for the top-left and bottom-right corners of the box
 (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
 """
 # Function is used during model forward pass, so we use the input framework if possible, without
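A minimal numeric sketch of the conversion the corrected docstring describes, written as a standalone helper rather than the library function:

import numpy as np

# (center_x, center_y, width, height) -> (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
def center_to_corners(boxes: np.ndarray) -> np.ndarray:
    cx, cy, w, h = boxes[..., 0], boxes[..., 1], boxes[..., 2], boxes[..., 3]
    return np.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], axis=-1)

print(center_to_corners(np.array([[50.0, 40.0, 20.0, 10.0]])))  # [[40. 35. 60. 45.]]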

src/transformers/image_utils.py

Lines changed: 1 addition & 1 deletion
@@ -545,7 +545,7 @@ def default_sample_indices_fn(metadata: VideoMetadata, num_frames=None, fps=None

 Args:
 metadata (`VideoMetadata`):
-`VideoMetadata` object containing metadat about the video, such as "total_num_frames" or "fps".
+`VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
 num_frames (`int`, *optional*):
 Number of frames to sample uniformly.
 fps (`int`, *optional*):
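An illustrative sketch, not the library implementation, of uniform frame sampling driven by the metadata fields named in the docstring ("total_num_frames", "fps"):

from typing import Optional
import numpy as np

def sample_indices(total_num_frames: int, video_fps: float,
                   num_frames: Optional[int] = None, fps: Optional[int] = None) -> np.ndarray:
    if num_frames is None and fps is not None:
        # Keep roughly `fps` sampled frames per second of video.
        num_frames = int(total_num_frames / video_fps * fps)
    if num_frames is None or num_frames >= total_num_frames:
        return np.arange(total_num_frames)
    return np.linspace(0, total_num_frames - 1, num_frames).astype(int)

print(sample_indices(total_num_frames=300, video_fps=30.0, num_frames=8))
# [  0  42  85 128 170 213 256 299]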

src/transformers/modeling_flash_attention_utils.py

Lines changed: 5 additions & 5 deletions
@@ -137,9 +137,9 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids):
 """
 This function returns necessary arguments to call `flash_attn_varlen_func`.
 All three query, key, value states will be flattened.
-Cummulative lengths of each examples in the batch will be extracted from position_ids.
+Cumulative lengths of each examples in the batch will be extracted from position_ids.

-NOTE: ideally cummulative lengths should be prepared at the data collator stage
+NOTE: ideally cumulative lengths should be prepared at the data collator stage

 Arguments:
 query (`torch.Tensor`):
@@ -268,7 +268,7 @@ def _flash_attention_forward(
 softmax_scale (`float`, *optional*):
 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
 use_top_left_mask (`bool`, defaults to `False`):
-flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference.
+flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference.
 softcap (`float`, *optional*):
 Softcap for the attention logits, used e.g. in gemma2.
 deterministic (`bool`, *optional*):
@@ -374,9 +374,9 @@ class FlashAttentionKwargs(TypedDict, total=False):

 Attributes:
 cu_seq_lens_q (`torch.LongTensor`, *optional*)
-Gets cumlative sequence length for query state.
+Gets cumulative sequence length for query state.
 cu_seq_lens_k (`torch.LongTensor`, *optional*)
-Gets cumlative sequence length for key state.
+Gets cumulative sequence length for key state.
 max_length_q (`int`, *optional*):
 Maximum sequence length for query state.
 max_length_k (`int`, *optional*):
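A sketch of the idea behind the docstring above, under the assumption that position ids restart at 0 for each packed example: the cumulative lengths expected by `flash_attn_varlen_func` can then be recovered from the positions of those zeros.

import torch

position_ids = torch.tensor([[0, 1, 2, 0, 1, 0, 1, 2, 3]])  # three packed examples
flat = position_ids.flatten()
starts = torch.nonzero(flat == 0).flatten()                  # start index of each example
lengths = torch.diff(starts, append=torch.tensor([flat.numel()]))
cu_seqlens = torch.nn.functional.pad(torch.cumsum(lengths, dim=0), (1, 0))
print(cu_seqlens)  # tensor([0, 3, 5, 9])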

src/transformers/modeling_flax_utils.py

Lines changed: 4 additions & 4 deletions
@@ -367,7 +367,7 @@ def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None):

 def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):
 r"""
-Cast the floating-point `parmas` to `jax.numpy.float32`. This method can be used to explicitly convert the
+Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the
 model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in place.

 Arguments:
@@ -394,7 +394,7 @@ def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):

 def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None):
 r"""
-Cast the floating-point `parmas` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the
+Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the
 `params` in place.

 This method can be used on GPU to explicitly convert the model parameters to float16 precision to do full
@@ -510,7 +510,7 @@ def can_generate(cls) -> bool:
 `bool`: Whether this model can generate sequences with `.generate()`.
 """
 # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation.
-# Alternativelly, the model can also have a custom `generate` function.
+# Alternatively, the model can also have a custom `generate` function.
 if "GenerationMixin" in str(cls.prepare_inputs_for_generation) and "GenerationMixin" in str(cls.generate):
 return False
 return True
@@ -968,7 +968,7 @@ def from_pretrained(
 )
 cls._missing_keys = missing_keys

-# Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
+# Mismatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
 # matching the weights in the model.
 mismatched_keys = []
 for key in state.keys():
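A minimal sketch of what the corrected `to_fp16`/`to_fp32` docstrings describe, written as a standalone helper rather than the library method: cast every floating-point leaf of a params pytree and return a new tree.

import jax
import jax.numpy as jnp

def cast_floating(params, dtype):
    def maybe_cast(x):
        return x.astype(dtype) if jnp.issubdtype(x.dtype, jnp.floating) else x
    return jax.tree_util.tree_map(maybe_cast, params)

params = {"dense": {"kernel": jnp.ones((2, 2), dtype=jnp.float32),
                    "step": jnp.array(3, dtype=jnp.int32)}}
half = cast_floating(params, jnp.float16)
print(half["dense"]["kernel"].dtype, half["dense"]["step"].dtype)  # float16 int32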

src/transformers/modeling_gguf_pytorch_utils.py

Lines changed: 2 additions & 2 deletions
@@ -373,7 +373,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
 # to add this patch to ensure things work correctly on our side.
 if "llama" in architecture and "mistral" in model_name:
 updated_architecture = "mistral"
-# FIXME: Currnetly this implementation is only for flan-t5 architecture.
+# FIXME: Currently this implementation is only for flan-t5 architecture.
 # It needs to be developed for supporting legacy t5.
 elif "t5" in architecture or "t5encoder" in architecture:
 parsed_parameters["config"]["is_gated_act"] = True
@@ -437,7 +437,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
 logger.info(f"Some keys were not parsed and added into account {gguf_key} | {value}")

 # retrieve config vocab_size from tokenizer
-# Pleas refer to https://github.com/huggingface/transformers/issues/32526 for more details
+# Please refer to https://github.com/huggingface/transformers/issues/32526 for more details
 if "vocab_size" not in parsed_parameters["config"]:
 tokenizer_parameters = parsed_parameters["tokenizer"]
 if "tokens" in tokenizer_parameters:

src/transformers/modeling_tf_utils.py

Lines changed: 3 additions & 3 deletions
@@ -795,7 +795,7 @@ def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatch
 ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether to ignore the mismatched keys

 Returns:
-`keras.models.Model`: Three lists, one for the layers that were found and succesfully restored (from the
+`keras.models.Model`: Three lists, one for the layers that were found and successfully restored (from the
 shard file), one for the mismatched layers, and another one for the unexpected layers.
 """
 saved_weight_names_set = set()
@@ -868,7 +868,7 @@ def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatch
 f"Unable to load weights from TF checkpoint file for '{resolved_archive_file}' "
 f"at '{resolved_archive_file}'. "
 "If you tried to load a TF model from a sharded checkpoint, you should try converting the model "
-"by loading it in pytorch and saving it localy. A convertion script should be realeased soon."
+"by loading it in pytorch and saving it locally. A convertion script should be released soon."
 )


@@ -1391,7 +1391,7 @@ def can_generate(cls) -> bool:
 `bool`: Whether this model can generate sequences with `.generate()`.
 """
 # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation.
-# Alternativelly, the model can also have a custom `generate` function.
+# Alternatively, the model can also have a custom `generate` function.
 if "GenerationMixin" in str(cls.prepare_inputs_for_generation) and "GenerationMixin" in str(cls.generate):
 return False
 return True
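A toy illustration of the override check mentioned in the corrected comment, using stand-in classes rather than real models: if both methods still resolve to the base `GenerationMixin` implementations, the class is treated as unable to generate.

class GenerationMixin:  # stand-in for the real mixin
    def prepare_inputs_for_generation(self): ...
    def generate(self): ...

class PlainModel(GenerationMixin):
    pass

class CustomModel(GenerationMixin):
    def prepare_inputs_for_generation(self): ...  # overridden

def can_generate(cls) -> bool:
    unchanged = ("GenerationMixin" in str(cls.prepare_inputs_for_generation)
                 and "GenerationMixin" in str(cls.generate))
    return not unchanged

print(can_generate(PlainModel), can_generate(CustomModel))  # False True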

src/transformers/modeling_utils.py

Lines changed: 12 additions & 12 deletions
@@ -1324,7 +1324,7 @@ def _find_mismatched_keys(
 and state_dict[checkpoint_key].numel() * 2 == model_state_dict[model_key].numel()
 ):
 # This skips size mismatches for 4-bit weights. Two 4-bit values share an 8-bit container, causing size differences.
-# Without matching with module type or paramter type it seems like a practical way to detect valid 4bit weights.
+# Without matching with module type or parameter type it seems like a practical way to detect valid 4bit weights.
 pass
 else:
 mismatched_keys.append(
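A tiny numeric illustration of the heuristic in the comment above: when two 4-bit values are packed into one 8-bit container, the stored tensor holds half as many elements as the logical weight.

import torch

logical = torch.empty(128, 64)                               # 8192 fp values
packed = torch.empty(128 * 64 // 2, 1, dtype=torch.uint8)    # two 4-bit values per byte
print(packed.numel() * 2 == logical.numel())                 # True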
@@ -1616,7 +1616,7 @@ def _autoset_attn_implementation(
 3. SDPA implementation, if available and supported by the model type. (`LlamaSdpaAttention` for example)
 4. The default model's implementation otherwise (`LlamaAttention` for example) .
 """
-# Here we use config._attn_implementation_internal to check whether the attention implementation was explicitely set by the user.
+# Here we use config._attn_implementation_internal to check whether the attention implementation was explicitly set by the user.
 # The property `PretrainedConfig._attn_implementation` is never `None`, for backward compatibility (always fall back on "eager").
 # The `hasattr` here is used as some Transformers tests for some reason do not call PretrainedConfig __init__ (e.g. test_no_super_init_config_and_model)
 requested_attn_implementation = None
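A hedged usage sketch tied to the priority list in this docstring: the attention backend can also be pinned explicitly instead of relying on the automatic selection. The checkpoint name is a placeholder.

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",  # placeholder checkpoint
    attn_implementation="sdpa",          # or "flash_attention_2" / "eager"
)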
@@ -2207,7 +2207,7 @@ def resize_token_embeddings(
 if new_num_tokens is None and pad_to_multiple_of is None:
 return model_embeds

-# Since we are basically resuing the same old embeddings with new weight values, gathering is required
+# Since we are basically reusing the same old embeddings with new weight values, gathering is required
 is_quantized = hasattr(self, "hf_quantizer") and self.hf_quantizer is not None
 if is_deepspeed_zero3_enabled() and not is_quantized:
 import deepspeed
@@ -2574,7 +2574,7 @@ def _init_added_embeddings_weights_with_mean(
 sample_shape=(added_num_tokens,)
 ).to(old_embeddings.weight.dtype)
 else:
-# Otherwise, just initialize with the mean. because distribtion will not be created.
+# Otherwise, just initialize with the mean. because distribution will not be created.
 new_embeddings.weight.data[-1 * added_num_tokens :, :] = (
 mean_embeddings[None, :].repeat(added_num_tokens, 1).to(old_embeddings.weight.dtype)
 )
@@ -2593,7 +2593,7 @@ def _init_added_lm_head_weights_with_mean(
 new_lm_head.weight.data = new_lm_head.weight.data.T
 old_lm_head.weight.data = old_lm_head.weight.data.T

-# The same initilization logic as Embeddings.
+# The same initialization logic as Embeddings.
 self._init_added_embeddings_weights_with_mean(
 old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens
 )
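A minimal sketch of the mean-initialization idea referenced in these comments, as a standalone snippet rather than the library helper: rows appended to an embedding matrix start from the mean of the existing rows instead of random values.

import torch

old_weight = torch.randn(100, 16)   # existing vocab of 100 tokens, hidden size 16
added_num_tokens = 4
mean_embedding = old_weight.mean(dim=0)
new_weight = torch.cat(
    [old_weight, mean_embedding[None, :].repeat(added_num_tokens, 1)], dim=0
)
print(new_weight.shape)  # torch.Size([104, 16])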
@@ -2740,7 +2740,7 @@ def gradient_checkpointing_disable(self):
 """
 if self.supports_gradient_checkpointing:
 # For old GC format (transformers < 4.35.0) for models that live on the Hub
-# we will fall back to the overwritten `_set_gradient_checkpointing` methid
+# we will fall back to the overwritten `_set_gradient_checkpointing` method
 _is_using_old_format = "value" in inspect.signature(self._set_gradient_checkpointing).parameters
 if not _is_using_old_format:
 self._set_gradient_checkpointing(enable=False)
@@ -2979,7 +2979,7 @@ def save_pretrained(
 if ignore_key in state_dict.keys():
 del state_dict[ignore_key]

-# Rename state_dict keys before saving to file. Do nothing unless overriden in a particular model.
+# Rename state_dict keys before saving to file. Do nothing unless overridden in a particular model.
 # (initially introduced with TimmWrapperModel to remove prefix and make checkpoints compatible with timm)
 state_dict = self._fix_state_dict_keys_on_save(state_dict)

@@ -4998,7 +4998,7 @@ def _load_pretrained_model(
 shard_file, is_quantized=is_quantized, map_location="meta", weights_only=weights_only
 )

-# Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
+# Mismatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
 # matching the weights in the model.
 mismatched_keys += _find_mismatched_keys(
 state_dict,
@@ -5321,13 +5321,13 @@ def tensor_parallel(self, device_mesh):
 """
 Tensor parallelize the model across the given device mesh. This function is a helper to be called after the model
 was already loaded in memory, note however that this means that each process will first initialize the whole model,
-then parallelize it accross devices. Thus there is a huge waste of GPU memory, and this can lead to OOM at loading time.
+then parallelize it across devices. Thus there is a huge waste of GPU memory, and this can lead to OOM at loading time.

-Calling `from_pretrained(..., tp_plan="auto")` is prefered, and will parallelize module-by-module during initialization,
+Calling `from_pretrained(..., tp_plan="auto")` is preferred, and will parallelize module-by-module during initialization,
 so that the expected per-device memory spike at loading time is not larger than the final model size on each device.
 Tensor parallelize the model across the given device mesh. This function is a helper to be called after the model
 was already loaded in memory, note however that this means that each process will first initialize the whole model,
-then parallelize it accross devices. Thus there is a huge waste of GPU memory, and this can lead to OOM at loading time.
+then parallelize it across devices. Thus there is a huge waste of GPU memory, and this can lead to OOM at loading time.

 Args:
 device_mesh (`torch.distributed.DeviceMesh`):
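A hedged usage sketch of the preferred path named in this docstring, assuming a multi-GPU `torchrun` launch; the checkpoint name is a placeholder.

from transformers import AutoModelForCausalLM

# Parallelizes module-by-module during initialization instead of loading the full
# model on every rank and sharding afterwards.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",  # placeholder checkpoint
    tp_plan="auto",
)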
@@ -5869,7 +5869,7 @@ def unwrap_model(model: nn.Module, recursive: bool = False) -> nn.Module:

 def expand_device_map(device_map, param_names, start_prefix):
 """
-Expand a device map to return the correspondance parameter name to device.
+Expand a device map to return the correspondence parameter name to device.
 """
 new_device_map = {}
 param_names = [p[len(start_prefix) :] for p in param_names if p.startswith(start_prefix)]
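An illustrative sketch (an assumed standalone helper, not the library code) of what "expanding" a device map means: each parameter name is mapped to the device of its longest matching module prefix.

def expand_map(device_map, param_names):
    expanded = {}
    for name in param_names:
        match = max((m for m in device_map if name == m or name.startswith(m + ".")),
                    key=len, default="")
        expanded[name] = device_map.get(match, "cpu")
    return expanded

device_map = {"model.layers.0": 0, "model.layers.1": 1, "lm_head": "cpu"}
params = ["model.layers.0.self_attn.q_proj.weight",
          "model.layers.1.mlp.up_proj.weight", "lm_head.weight"]
print(expand_map(device_map, params))
# {'model.layers.0.self_attn.q_proj.weight': 0, 'model.layers.1.mlp.up_proj.weight': 1, 'lm_head.weight': 'cpu'}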
