2 changes: 1 addition & 1 deletion src/transformers/convert_slow_tokenizer.py
@@ -1413,7 +1413,7 @@ def converted(self) -> Tokenizer:
 class MoshiConverter(SpmConverter):
     handle_byte_fallback = True

-    def __init__(self, vocab_file, model_max_length=None, **kwargs):
+    def __init__(self, vocab_file, **kwargs):
         requires_backends(self, "protobuf")

         Converter.__init__(self, vocab_file)
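
Note: after this change the converter is built from the vocab file alone; the dropped model_max_length argument is presumably handled by the tokenizer that wraps the converted backend rather than by the converter itself. A minimal usage sketch (not part of the diff; the vocab path is a placeholder):

from transformers.convert_slow_tokenizer import MoshiConverter

# Build the fast-tokenizer backend from a SentencePiece model file.
converter = MoshiConverter(vocab_file="path/to/tokenizer.model")  # placeholder path
fast_backend = converter.converted()  # returns a tokenizers.Tokenizer instance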
4 changes: 1 addition & 3 deletions src/transformers/generation/continuous_batching/cache.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 from collections import deque
 from math import floor, gcd, sqrt
-from typing import Optional, Union
+from typing import Optional

 import torch
@@ -123,7 +123,6 @@ def __init__(
         generation_config: GenerationConfig,
         device: torch.device,
         dtype: torch.dtype = torch.float16,
-        layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
         tp_size: Optional[int] = None,
     ) -> None:
         """Initialize a paged attention cache for efficient memory usage.
@@ -133,7 +132,6 @@ def __init__(
             generation_config: Generation configuration containing cache parameters
             device: Device for the cache tensors
             dtype: Data type of the cache
-            layer_device_map: Optional mapping of layer indices to devices
            tp_size: Tensor parallelism size
        """
        self.config = config
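
With layer_device_map removed from the signature, placement of the cache tensors is determined by the single device argument. A construction sketch under stated assumptions: the class defined in this file is taken to be PagedAttentionCache, the import path mirrors the file path, and a default GenerationConfig is assumed to carry enough cache settings for illustration.

import torch
from transformers import AutoConfig, GenerationConfig
# Assumed import path, mirroring src/transformers/generation/continuous_batching/cache.py
from transformers.generation.continuous_batching.cache import PagedAttentionCache

config = AutoConfig.from_pretrained("gpt2")      # any decoder config, for illustration
cache = PagedAttentionCache(
    config=config,
    generation_config=GenerationConfig(),        # assumed to provide the cache parameters
    device=torch.device("cuda:0"),
    dtype=torch.float16,
    tp_size=None,                                # no tensor parallelism in this sketch
)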
10 changes: 4 additions & 6 deletions src/transformers/modeling_utils.py
@@ -3259,11 +3259,11 @@ def _get_resized_embeddings(

                with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None):
                    self._init_added_embeddings_weights_with_mean(
-                        old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens
+                        old_embeddings, new_embeddings, old_num_tokens, added_num_tokens
                    )
            else:
                self._init_added_embeddings_weights_with_mean(
-                    old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens
+                    old_embeddings, new_embeddings, old_num_tokens, added_num_tokens
                )

        # Copy token embeddings from the previous weights
@@ -3433,7 +3433,7 @@ def _get_resized_lm_head(
        return new_lm_head

    def _init_added_embeddings_weights_with_mean(
-        self, old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens
+        self, old_embeddings, new_embeddings, old_num_tokens, added_num_tokens
    ):
        old_embeddings_weight = old_embeddings.weight.data.to(torch.float32)
        mean_embeddings = torch.mean(old_embeddings_weight, axis=0)
@@ -3472,9 +3472,7 @@ def _init_added_lm_head_weights_with_mean(
            old_lm_head.weight.data = old_lm_head.weight.data.T

        # The same initialization logic as Embeddings.
-        self._init_added_embeddings_weights_with_mean(
-            old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens
-        )
+        self._init_added_embeddings_weights_with_mean(old_lm_head, new_lm_head, old_num_tokens, added_num_tokens)

        if transposed:
            # Transpose again to the correct shape.
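
The old_embedding_dim / old_lm_head_dim values dropped from these calls are not needed by the mean-based initialization: the hidden size is implicit in the old weight matrix, and the mean is taken over the vocabulary axis (axis=0), as the first lines of _init_added_embeddings_weights_with_mean show. A simplified sketch of that idea follows; init_added_rows_with_mean is a hypothetical helper, not the library function, and the actual method is more elaborate than this one-line copy.

import torch

def init_added_rows_with_mean(old_weight: torch.Tensor, new_weight: torch.Tensor,
                              old_num_tokens: int, added_num_tokens: int) -> None:
    # Compute statistics in float32 over the vocabulary axis; the embedding width
    # comes from the tensor shape itself, so no explicit dimension argument is required.
    mean = old_weight.to(torch.float32).mean(dim=0)
    # Broadcast the (hidden_size,) mean into each newly added row.
    new_weight.data[old_num_tokens : old_num_tokens + added_num_tokens] = mean.to(new_weight.dtype)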