
Commit 796bae0

Update transformers to v4.55 (#21931)

Authored by hmellor, DarkLight1337, Isotr0py, and WoosukKwon

Signed-off-by: Harry Mellor <[email protected]>
Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
Signed-off-by: isotr0py <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
Co-authored-by: DarkLight1337 <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
Co-authored-by: Woosuk Kwon <[email protected]>

1 parent: 6e20924

File tree

13 files changed: +235 -39 lines


requirements/common.txt

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.53.2
+transformers >= 4.55.0
 huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads.
 tokenizers >= 0.21.1 # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.

requirements/test.in

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
 mteb[bm25s]>=1.38.11, <2 # required for mteb test
-transformers==4.53.2
+transformers==4.55.0
 tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads.
 schemathesis>=3.39.15 # Required for openai schema test.

requirements/test.txt

Lines changed: 3 additions & 3 deletions
@@ -214,7 +214,7 @@ fiona==1.10.1
     # via torchgeo
 flask==3.1.1
     # via mlflow
-fonttools==4.54.1
+fonttools==4.55.0
     # via matplotlib
 fqdn==1.5.1
     # via jsonschema
@@ -286,7 +286,7 @@ httpx==0.27.2
     # via
     #   -r requirements/test.in
     #   schemathesis
-huggingface-hub==0.33.1
+huggingface-hub==0.34.3
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -1148,7 +1148,7 @@ tqdm==4.66.6
     #   transformers
 tqdm-multiprocess==0.0.11
     # via lm-eval
-transformers==4.53.2
+transformers==4.55.0
     # via
     #   -r requirements/test.in
     #   genai-perf

tests/models/multimodal/generation/test_common.py

Lines changed: 4 additions & 0 deletions
@@ -337,6 +337,10 @@
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        # FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we
+        # should enable this again after the fix is released:
+        # https://github.com/huggingface/transformers/pull/39915
+        marks=[pytest.mark.skip("HF model is broken")],
     ),
     "gemma3": VLMTestInfo(
         models=["google/gemma-3-4b-it"],
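The skip above is wired through the test's `marks` field. As a generic illustration of the mechanism (plain pytest, not vLLM's `VLMTestInfo` harness; the model IDs are placeholders), a mark attached to a single parametrized case disables only that case:

```python
import pytest

# Generic pytest illustration: a skip mark attached to one parametrized case
# disables just that case, the same mechanism `marks=[pytest.mark.skip(...)]`
# above relies on.
CASES = [
    pytest.param("google/gemma-3-4b-it", id="gemma3"),
    pytest.param("adept/fuyu-8b", id="fuyu",
                 marks=[pytest.mark.skip(reason="HF model is broken")]),
]


@pytest.mark.parametrize("model_id", CASES)
def test_generate(model_id: str) -> None:
    assert isinstance(model_id, str)
```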

tests/models/registry.py

Lines changed: 15 additions & 9 deletions
@@ -179,8 +179,7 @@ def check_available_online(
                                          min_transformers_version="4.54"),
     "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),  # noqa: E501
     "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
-    "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base",
-                                          min_transformers_version="4.53"),
+    "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base"),
     "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
     "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
     "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
@@ -223,7 +222,10 @@ def check_available_online(
                                         trust_remote_code=True),
     "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
     "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
-                                        extras={"tiny": "ai21labs/Jamba-tiny-dev"}),  # noqa: E501
+                                        extras={
+                                            "tiny": "ai21labs/Jamba-tiny-dev",
+                                            "random": "ai21labs/Jamba-tiny-random",  # noqa: E501
+                                        }),
     "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct",
                                         extras={"guard": "meta-llama/Llama-Guard-3-1B",  # noqa: E501
                                                 "hermes": "NousResearch/Hermes-3-Llama-3.1-8B",  # noqa: E501
@@ -239,8 +241,7 @@ def check_available_online(
                                           trust_remote_code=True),
     "MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B",
                                            trust_remote_code=True),
-    "MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf",
-                                          min_transformers_version="4.53"),
+    "MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf"),
     "MiniMaxText01ForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01",
                                                 trust_remote_code=True,
                                                 revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"),  # noqa: E501
@@ -272,6 +273,8 @@ def check_available_online(
     "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                          trust_remote_code=True),
     "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
+                                         max_transformers_version="4.53",
+                                         transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings",  # noqa: E501
                                          trust_remote_code=True),
     "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
                                        trust_remote_code=True),
@@ -299,8 +302,7 @@ def check_available_online(
     "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
     "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                        trust_remote_code=True),
-    "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst",
-                                        min_transformers_version="4.53"),
+    "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
     # [Encoder-decoder]
     "BartModel": _HfExamplesInfo("facebook/bart-base"),
     "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
@@ -326,8 +328,12 @@ def check_available_online(
     "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe",
                                       trust_remote_code=True, v0_only=True),  # noqa: E501
     "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
-    "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
-    "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"),
+    "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B",
+                                           max_transformers_version="4.53",
+                                           transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"),  # noqa: E501
+    "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B",
+                                                  max_transformers_version="4.53",
+                                                  transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"),  # noqa: E501
     "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True),  # noqa: E501
     "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True),  # noqa: E501
     "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True),  # noqa: E501
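The new `max_transformers_version` and `transformers_version_reason` fields mark example models that only work up to a given Transformers release; tests consult them via `check_transformers_version(on_fail="skip")` (see the quantization test below). A minimal sketch of how such a gate could work; the field names match the diff, but the class and comparison logic here are simplified assumptions, not vLLM's actual `_HfExamplesInfo`:

```python
# Sketch only: a simplified stand-in for vLLM's _HfExamplesInfo version gate.
from dataclasses import dataclass
from typing import Optional

import pytest
import transformers
from packaging.version import Version


def _major_minor(version: str) -> tuple:
    """Reduce a version string to (major, minor) for coarse comparison."""
    release = Version(version).release
    return (release[0], release[1] if len(release) > 1 else 0)


@dataclass
class ExampleModelInfo:
    model: str
    min_transformers_version: Optional[str] = None
    max_transformers_version: Optional[str] = None
    transformers_version_reason: Optional[str] = None

    def check_transformers_version(self, *, on_fail: str = "error") -> None:
        """Skip (or raise) when the installed Transformers is out of range."""
        current = _major_minor(transformers.__version__)
        too_old = (self.min_transformers_version is not None
                   and current < _major_minor(self.min_transformers_version))
        too_new = (self.max_transformers_version is not None
                   and current > _major_minor(self.max_transformers_version))
        if not (too_old or too_new):
            return
        msg = (f"{self.model} unsupported on Transformers "
               f"{transformers.__version__}: "
               f"{self.transformers_version_reason or 'no reason given'}")
        if on_fail == "skip":
            pytest.skip(msg)
        raise RuntimeError(msg)


# Example: Qwen/Qwen2.5-Math-PRM-7B is capped at 4.53 in the registry above.
info = ExampleModelInfo("Qwen/Qwen2.5-Math-PRM-7B",
                        max_transformers_version="4.53",
                        transformers_version_reason="remote code incompatible")
```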

tests/quantization/test_experts_int8.py

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,8 @@
 
 from tests.quantization.utils import is_quant_method_supported
 
+from ..models.registry import HF_EXAMPLE_MODELS
+
 MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
 
 
@@ -25,6 +27,8 @@ def test_model_experts_int8_startup(
     dtype: str,
     max_tokens: int,
 ) -> None:
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_transformers_version(on_fail="skip")
 
     with vllm_runner(model, dtype=dtype,
                      quantization="experts_int8") as vllm_model:

vllm/model_executor/models/interfaces_base.py

Lines changed: 8 additions & 4 deletions
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
+from typing import (TYPE_CHECKING, Any, ClassVar, Literal, Optional, Protocol,
                     Union, overload, runtime_checkable)
 
 import torch
@@ -14,6 +14,10 @@
     from vllm.config import VllmConfig
     from vllm.model_executor.layers.pooler import Pooler
     from vllm.model_executor.sampling_metadata import SamplingMetadata
+else:
+    VllmConfig = Any
+    Pooler = Any
+    SamplingMetadata = Any
 
 logger = init_logger(__name__)
 
@@ -34,7 +38,7 @@ class VllmModel(Protocol[T_co]):
 
     def __init__(
         self,
-        vllm_config: "VllmConfig",
+        vllm_config: VllmConfig,
         prefix: str = "",
     ) -> None:
         ...
@@ -96,7 +100,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]):
     def compute_logits(
         self,
         hidden_states: T,
-        sampling_metadata: "SamplingMetadata",
+        sampling_metadata: SamplingMetadata,
     ) -> Optional[T]:
         """Return `None` if TP rank > 0."""
         ...
@@ -140,7 +144,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]):
     MRO of your model class.
     """
 
-    pooler: "Pooler"
+    pooler: Pooler
     """The pooler is only called on TP rank 0."""
 
 
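The import change above replaces quoted forward references with names that exist at runtime: under `TYPE_CHECKING` the real classes are imported for type checkers, otherwise they are aliased to `Any`. A self-contained sketch of the pattern (the module and class names below are illustrative, not vLLM's):

```python
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable

if TYPE_CHECKING:
    # Seen only by static type checkers; avoids importing heavy modules
    # (or creating import cycles) at runtime.
    from mypackage.config import EngineConfig  # hypothetical module
else:
    # At runtime the annotation still resolves to a real object, so
    # introspection (e.g. typing.get_type_hints) does not hit an
    # unresolved forward reference.
    EngineConfig = Any


@runtime_checkable
class SupportsEngineInit(Protocol):

    def __init__(self, config: EngineConfig, prefix: str = "") -> None:
        ...
```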

vllm/model_executor/models/qwen2_vl.py

Lines changed: 6 additions & 5 deletions
@@ -1395,11 +1395,12 @@ def __init__(
         **kwargs,
     ):
         self.image_processor = Tarsier2ImageProcessor(**vision_config)
-        super().__init__(image_processor=self.image_processor,
-                         tokenizer=tokenizer,
-                         video_processor=Qwen2VLVideoProcessor(),
-                         chat_template=None,
-                         **kwargs)
+        super().__init__(
+            image_processor=self.image_processor,
+            tokenizer=tokenizer,
+            video_processor=Qwen2VLVideoProcessor(**vision_config),
+            chat_template=None,
+            **kwargs)
 
 
 class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):

vllm/model_executor/models/transformers.py

Lines changed: 10 additions & 7 deletions
@@ -90,7 +90,7 @@ def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
 def replace_linear_class(
         linear: nn.Linear, style: Literal["colwise", "rowwise"],
         quant_config: QuantizationConfig
-) -> Union[ColumnParallelLinear, RowParallelLinear]:
+) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]:
     """
     Replace nn.Linear with one of vLLM's tensor parallel linear classes.
 
@@ -445,7 +445,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         # Set correct attn and init on "meta" to delay allocating GPU tensors
         # TODO: @raushan, use the public `model.set_attn_implementation()`
-        # method after v4.54.0 is released
+        # method once its checks are fixed in Transformers.
         self.text_config._attn_implementation = "vllm"
         with init_on_device_without_buffers("meta"), config_override:
             self.model: PreTrainedModel = AutoModel.from_config(
@@ -520,7 +520,7 @@ def pipeline_parallel(self):
         for i in range(len(layers)):
             if start_layer <= i and i < end_layer:
                 continue
-            layers[i] = PPMissingLayer(return_tuple=True)
+            layers[i] = PPMissingLayer()
 
         # Layers after module list
         for name in pp_plan[module_list_idx + 1:]:
@@ -533,14 +533,16 @@ def tensor_parallel(self):
         Apply the model's tensor parallelization plan.
         Currently only supports linear layers.
         """
-        if not self.model.supports_tp_plan:
-            if self.tp_size <= 1:
-                return
+        tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {}
 
+        if not tp_plan and self.tp_size > 1:
             raise ValueError(
                 f"{type(self.model)} does not support tensor parallel yet!")
 
-        tp_plan = self.model._tp_plan
+        # Some weight loaders expect linear layers to inherit from vLLM's
+        # LinearBase class, so we set a default style which causes any
+        # unspecified linear layers to be replaced with ReplicatedLinear
+        tp_plan[".*"] = "replicated"
 
         def _tensor_parallel(module: nn.Module, prefix: str = ""):
             for child_name, child_module in module.named_children():
@@ -552,6 +554,7 @@ def _tensor_parallel(module: nn.Module, prefix: str = ""):
                         child_module, style, self.quant_config)
                     setattr(module, child_name, new_module)
                     log_replacement(qual_name, child_module, new_module)
+                    break
                 else:
                     _tensor_parallel(child_module, prefix=qual_name)
 
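The rewritten `tensor_parallel` reads the plan from `config.base_model_tp_plan` and appends a catch-all `".*": "replicated"` entry, so any linear layer not named by the model's plan is still swapped for `ReplicatedLinear` (which weight loaders expect to inherit from vLLM's `LinearBase`). A rough sketch of how regex-keyed plan matching behaves; the helper below is an assumption for illustration, not the actual code in `transformers.py`:

```python
import re

# Hypothetical helper: pick the first matching parallel style for a module path.
def choose_style(tp_plan: dict, qual_name: str) -> str:
    for pattern, style in tp_plan.items():  # insertion order: specific entries first
        if re.match(pattern, qual_name):
            return style
    raise KeyError(f"no tensor-parallel style for {qual_name}")


# Example plan: attention projections are sharded, and the trailing ".*" entry
# (appended last, as in the diff) catches every other linear layer as replicated.
plan = {
    r".*self_attn\.q_proj": "colwise",
    r".*self_attn\.o_proj": "rowwise",
}
plan[".*"] = "replicated"

assert choose_style(plan, "model.layers.0.self_attn.q_proj") == "colwise"
assert choose_style(plan, "model.layers.0.mlp.gate_proj") == "replicated"
```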

vllm/model_executor/models/utils.py

Lines changed: 2 additions & 8 deletions
@@ -534,16 +534,10 @@ class PPMissingLayer(torch.nn.Identity):
 
     def __init__(self, *args, **kwargs):
         super().__init__()
-        self.return_tuple = kwargs.get("return_tuple", False)
 
     def forward(self, *args, **kwargs):
-        """
-        Return the first arg from args or the first value from kwargs.
-
-        Wraps the input in a tuple if `self.return_tuple` is True.
-        """
-        input = args[0] if args else next(iter(kwargs.values()))
-        return (input, ) if self.return_tuple else input
+        """Return the first arg from args or the first value from kwargs."""
+        return args[0] if args else next(iter(kwargs.values()))
 
 
 _CPU_OFFLOAD_BYTES = 0
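With `return_tuple` gone, `PPMissingLayer` is a plain pass-through placeholder for layers owned by other pipeline-parallel ranks. A tiny usage sketch of the class as it reads after this diff:

```python
import torch

from vllm.model_executor.models.utils import PPMissingLayer

layer = PPMissingLayer()
hidden = torch.zeros(2, 8)

# Positional and keyword inputs are forwarded unchanged (no tuple wrapping).
assert layer(hidden) is hidden
assert layer(hidden_states=hidden) is hidden
```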
