diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index aa32734ffb38..bc50ebc0a9b7 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -1413,7 +1413,7 @@ def converted(self) -> Tokenizer:
 class MoshiConverter(SpmConverter):
     handle_byte_fallback = True
 
-    def __init__(self, vocab_file, model_max_length=None, **kwargs):
+    def __init__(self, vocab_file, **kwargs):
         requires_backends(self, "protobuf")
 
         Converter.__init__(self, vocab_file)
diff --git a/src/transformers/generation/continuous_batching/cache.py b/src/transformers/generation/continuous_batching/cache.py
index 8d6e057be84a..7a92e0587d8e 100644
--- a/src/transformers/generation/continuous_batching/cache.py
+++ b/src/transformers/generation/continuous_batching/cache.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 from collections import deque
 from math import floor, gcd, sqrt
-from typing import Optional, Union
+from typing import Optional
 
 import torch
 
@@ -123,7 +123,6 @@ def __init__(
         generation_config: GenerationConfig,
         device: torch.device,
         dtype: torch.dtype = torch.float16,
-        layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
         tp_size: Optional[int] = None,
     ) -> None:
         """Initialize a paged attention cache for efficient memory usage.
@@ -133,7 +132,6 @@ def __init__(
             generation_config: Generation configuration containing cache parameters
             device: Device for the cache tensors
             dtype: Data type of the cache
-            layer_device_map: Optional mapping of layer indices to devices
             tp_size: Tensor parallelism size
         """
         self.config = config
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 3f8c14aef5e5..5ec3ac30e613 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -3259,11 +3259,11 @@ def _get_resized_embeddings(
 
                 with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None):
                     self._init_added_embeddings_weights_with_mean(
-                        old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens
+                        old_embeddings, new_embeddings, old_num_tokens, added_num_tokens
                     )
             else:
                 self._init_added_embeddings_weights_with_mean(
-                    old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens
+                    old_embeddings, new_embeddings, old_num_tokens, added_num_tokens
                 )
 
         # Copy token embeddings from the previous weights
@@ -3433,7 +3433,7 @@ def _get_resized_lm_head(
         return new_lm_head
 
     def _init_added_embeddings_weights_with_mean(
-        self, old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens
+        self, old_embeddings, new_embeddings, old_num_tokens, added_num_tokens
     ):
         old_embeddings_weight = old_embeddings.weight.data.to(torch.float32)
         mean_embeddings = torch.mean(old_embeddings_weight, axis=0)
@@ -3472,9 +3472,7 @@ def _init_added_lm_head_weights_with_mean(
         old_lm_head.weight.data = old_lm_head.weight.data.T
 
         # The same initialization logic as Embeddings.
-        self._init_added_embeddings_weights_with_mean(
-            old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens
-        )
+        self._init_added_embeddings_weights_with_mean(old_lm_head, new_lm_head, old_num_tokens, added_num_tokens)
 
         if transposed:
             # Transpose again to the correct shape.
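
Context for the `modeling_utils.py` hunks: dropping the unused `old_embedding_dim` / `old_lm_head_dim` arguments is safe because the helper only needs the weight tensor itself, and the hidden size is implicit in its shape. Below is a minimal, simplified sketch of that idea; the function and variable names (`init_added_rows_with_mean`, `mean_row`) are illustrative only, not the library API, and the real `_init_added_embeddings_weights_with_mean` does more than a plain mean fill.

```python
import torch
import torch.nn as nn


def init_added_rows_with_mean(embedding: nn.Embedding, old_num_tokens: int, added_num_tokens: int) -> None:
    # Work in float32 for a stable mean, mirroring the cast in the patched helper.
    old_rows = embedding.weight.data[:old_num_tokens].to(torch.float32)
    # Mean over the vocabulary axis; the hidden size is implicit in the weight's
    # shape, which is why no separate `old_embedding_dim` argument is needed.
    mean_row = old_rows.mean(dim=0)
    embedding.weight.data[old_num_tokens : old_num_tokens + added_num_tokens] = mean_row.to(embedding.weight.dtype)


# Hypothetical usage: a table resized from 10 to 12 tokens gets its 2 new rows
# initialized to the mean of the original 10.
emb = nn.Embedding(12, 8)
init_added_rows_with_mean(emb, old_num_tokens=10, added_num_tokens=2)
```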