
Commit 3f7cede

Update transformers to 4.53.0 (NVIDIA#5747)
Signed-off-by: Hao Lu <14827759+hlu1@users.noreply.github.com>
Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
1 parent 74dca0a commit 3f7cede

10 files changed (+66 −40 lines)


requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ torchvision
 nvidia-modelopt[torch]~=0.31.0
 nvidia-nccl-cu12
 nvidia-cuda-nvrtc-cu12
-transformers~=4.51.1
+transformers==4.53.1
 pydantic>=2.9.1
 pydantic-settings
 pillow==10.3.0
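
The pin moves from a compatible-release range (transformers~=4.51.1) to an exact version. As a quick sanity check that an environment matches the new pin (a sketch, not part of the commit):

import transformers
from packaging.version import Version

# requirements.txt now pins transformers to exactly 4.53.1.
assert Version(transformers.__version__) == Version("4.53.1"), transformers.__version__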

tensorrt_llm/_torch/attention_backend/interface.py

Lines changed: 3 additions & 2 deletions
@@ -359,8 +359,9 @@ def from_config(config) -> "RopeParams":
         # get rotary parameters.
         hidden_size = config.hidden_size
         num_attention_heads = config.num_attention_heads
-        head_dim = getattr(config, 'head_dim',
-                           hidden_size // num_attention_heads)
+        head_dim = getattr(config, 'head_dim', None)
+        if not isinstance(head_dim, int):
+            head_dim = hidden_size // num_attention_heads
         rope_scaling = getattr(config, 'rope_scaling', None)
         rope_params.max_positions = config.max_position_embeddings
         rope_params.theta = getattr(config, 'rope_theta', 10000.0)
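
The reason for the two-step lookup: in newer transformers configs, head_dim can be present but set to None, so a plain getattr default never kicks in. The same pattern recurs in attention.py and _util.py below. A minimal sketch of the difference, using a hypothetical config object:

from types import SimpleNamespace

# Hypothetical config where head_dim exists but is explicitly None.
config = SimpleNamespace(hidden_size=4096, num_attention_heads=32, head_dim=None)

# Old pattern: the default only applies when the attribute is missing,
# so this returns None instead of a usable integer.
old_head_dim = getattr(config, 'head_dim',
                       config.hidden_size // config.num_attention_heads)
assert old_head_dim is None

# New pattern: fall back whenever the value is not an int.
head_dim = getattr(config, 'head_dim', None)
if not isinstance(head_dim, int):
    head_dim = config.hidden_size // config.num_attention_heads
assert head_dim == 128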

tensorrt_llm/_torch/models/modeling_gemma3vl.py

Lines changed: 0 additions & 3 deletions
@@ -181,6 +181,3 @@ def forward(
         logits = self.llm.forward(attn_metadata, input_ids, position_ids,
                                   inputs_embeds, return_context_logits)
         return logits
-
-
-AutoModel.register(Gemma3Config, Gemma3Model)

tensorrt_llm/_torch/models/modeling_llava_next.py

Lines changed: 0 additions & 3 deletions
@@ -287,6 +287,3 @@ def forward(
         logits = self.llm.forward(attn_metadata, input_ids, position_ids,
                                   inputs_embeds, return_context_logits)
         return logits
-
-
-AutoModel.register(LlavaNextConfig, LlavaNextModel)

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 4 additions & 0 deletions
@@ -179,6 +179,8 @@ def get_rope_index(
         # Calculate temporal position IDs based on model type
         if hasattr(model_config.vision_config, 'tokens_per_second'):
             # Qwen2_5_VL style temporal position calculation
+            if isinstance(second_per_grid_t, torch.Tensor):
+                second_per_grid_t = second_per_grid_t.item()
             range_tensor = torch.arange(llm_grid_t).view(-1, 1)
             expanded_range = range_tensor.expand(
                 -1, llm_grid_h * llm_grid_w)
@@ -273,6 +275,8 @@ def _preprocess(self, text: dict[str, any], mm_data: dict[str, any],
             do_rescale = False
         if videos and isinstance(videos[0][0], torch.Tensor):
             do_rescale = False
+            # transformers==4.53.1 does not support GPU video tensors in the Qwen2VL processor.
+            videos = [[frame.to("cpu") for frame in video] for video in videos]
         return self.processor(text=[text],
                               images=images,
                               videos=videos,
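
Both additions normalize inputs before they reach code that expects plain Python values and CPU data: a 0-d tensor is unwrapped with .item(), and video frames are copied to host memory before hitting the HF processor. A minimal standalone sketch of the same two conversions (shapes and values are illustrative):

import torch

# second_per_grid_t may now arrive as a 0-d tensor; unwrap it to a Python scalar.
second_per_grid_t = torch.tensor(2.0)
if isinstance(second_per_grid_t, torch.Tensor):
    second_per_grid_t = second_per_grid_t.item()

# Videos as nested lists of frame tensors (possibly on GPU); move frames to CPU.
videos = [[torch.rand(3, 224, 224) for _ in range(4)]]
videos = [[frame.to("cpu") for frame in video] for video in videos]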

tensorrt_llm/_torch/modules/attention.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,9 @@ def __init__(
6767
config = config or ModelConfig()
6868
self.hidden_size = hidden_size
6969
self.num_heads = num_attention_heads
70-
self.head_dim = getattr(config.pretrained_config, "head_dim",
71-
self.hidden_size // self.num_heads)
70+
self.head_dim = getattr(config.pretrained_config, 'head_dim', None)
71+
if not isinstance(self.head_dim, int):
72+
self.head_dim = self.hidden_size // self.num_heads
7273
self.num_key_value_heads = num_key_value_heads
7374
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
7475
self.max_position_embeddings = max_position_embeddings

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 7 additions & 7 deletions
@@ -75,11 +75,10 @@ def _get_cache_size_per_token(model_config: ModelConfig,
         head_dim = config.kv_lora_rank + config.qk_rope_head_dim
         kv_factor = 1
     else:
-        head_dim = getattr(
-            config,
-            "head_dim",
-            config.hidden_size // config.num_attention_heads,
-        ) * num_key_value_heads // tp_size
+        _head_dim = getattr(config, 'head_dim', None)
+        if not isinstance(_head_dim, int):
+            _head_dim = config.hidden_size // config.num_attention_heads
+        head_dim = _head_dim * num_key_value_heads // tp_size
 
     # provide at least 1 layer to prevent division by zero cache size
     num_attention_layers = max(
@@ -281,8 +280,9 @@ def _create_kv_cache_manager(
         num_attention_heads = config.num_attention_heads
         num_key_value_heads = getattr(config, 'num_key_value_heads',
                                       num_attention_heads)
-        head_dim = getattr(config, "head_dim",
-                           hidden_size // num_attention_heads)
+        head_dim = getattr(config, "head_dim", None)
+        if not isinstance(head_dim, int):
+            head_dim = hidden_size // num_attention_heads
 
         if quant_config is not None and quant_config.quant_mode.has_fp8_kv_cache(
         ):
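
The rewrite in _get_cache_size_per_token keeps the same arithmetic: per layer and per token, the KV cache holds head_dim * num_key_value_heads // tp_size elements for K and again for V. A rough illustration of the resulting numbers, assuming a Llama-7B-like config with an FP16 KV cache and no tensor parallelism (all values below are illustrative, not taken from the commit):

# Illustrative config, not from the commit.
hidden_size = 4096
num_attention_heads = 32
num_key_value_heads = 32
num_attention_layers = 32
tp_size = 1
bytes_per_elem = 2   # FP16 KV cache
kv_factor = 2        # separate K and V tensors

head_dim = hidden_size // num_attention_heads          # 128
width = head_dim * num_key_value_heads // tp_size      # 4096 elements per layer
cache_bytes_per_token = width * num_attention_layers * kv_factor * bytes_per_elem
print(cache_bytes_per_token)  # 524288 bytes, i.e. 0.5 MiB per token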

tests/unittest/_torch/modeling/test_modeling_mllama.py

Lines changed: 21 additions & 1 deletion
@@ -1,3 +1,4 @@
+import re
 import unittest
 from copy import deepcopy
 
@@ -255,6 +256,24 @@
 }
 
 
+def convert_weights_names(weights: dict) -> dict:
+    # Since transformers version >= 4.52.0, the default model architecture has changed.
+    # We need to convert the weight names accordingly to match TRTLLM naming.
+    _checkpoint_conversion_mapping = {
+        "^model.language_model": "language_model.model",
+        "^model.vision_model": "vision_model",
+        "^model.multi_modal_projector": "multi_modal_projector",
+        "^lm_head": "language_model.lm_head",
+    }
+    converted_weights = {}
+    for weight_name, weight_value in weights.items():
+        new_name = weight_name
+        for pattern, replacement in _checkpoint_conversion_mapping.items():
+            new_name = re.sub(pattern, replacement, new_name)
+        converted_weights[new_name] = weight_value
+    return converted_weights
+
+
 class TestMLlama(unittest.TestCase):
 
     @parameterized.expand([
@@ -301,7 +320,8 @@ def test_mllama_allclose_to_hf_text_only(self, scenario: Scenario) -> None:
         mllama = MllamaForConditionalGeneration(
             ModelConfig(pretrained_config=mllama_config,
                         attn_backend=backend)).to(dtype).to(device)
-        mllama.load_weights(hf_mllama.state_dict())
+        weights = convert_weights_names(hf_mllama.state_dict())
+        mllama.load_weights(weights)
 
         # KV cache setup
         num_blocks = 1
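
Since convert_weights_names is plain regex renaming, its effect is easy to see on a toy state dict (the keys below are illustrative, not taken from a real checkpoint):

import re

toy_weights = {
    "model.language_model.layers.0.self_attn.q_proj.weight": 0,
    "lm_head.weight": 1,
}
mapping = {
    "^model.language_model": "language_model.model",
    "^lm_head": "language_model.lm_head",
}
renamed = {}
for name, value in toy_weights.items():
    for pattern, replacement in mapping.items():
        name = re.sub(pattern, replacement, name)
    renamed[name] = value
print(sorted(renamed))
# ['language_model.lm_head.weight',
#  'language_model.model.layers.0.self_attn.q_proj.weight']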

tests/unittest/trt/attention/test_gpt_attention.py

Lines changed: 18 additions & 12 deletions
@@ -1230,11 +1230,14 @@ def verify_kv_cache(torch_present):
             else:
                 attention_packed_mask = None
             if attention_type == 'gpt2_attention':
-                torch_output, torch_present = attention(
-                    input_tensor,
-                    layer_past=None,
-                    use_cache=True,
-                    attention_mask=attention_mask)
+                # gpt2 uses DynamicCache
+                torch_present = DynamicCache.from_legacy_cache(
+                    torch_present)
+                torch_output = attention(input_tensor,
+                                         past_key_value=torch_present,
+                                         use_cache=True,
+                                         attention_mask=attention_mask)[0]
+                torch_present = torch_present.to_legacy_cache()
             elif attention_type == 'llama_attention':
                 position_embeddings = rotary_emb(input_tensor, position_ids)
                 attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask(
@@ -1277,7 +1280,7 @@ def verify_kv_cache(torch_present):
 
             torch.cuda.synchronize()
 
-            if attention_type == 'llama_attention':
+            if attention_type in ['llama_attention', 'gpt2_attention']:
                 kv_dequant_scale, kv_quant_scale = get_kv_quant_scale(
                     torch_present[0])
             else:
@@ -1322,7 +1325,7 @@ def verify_kv_cache(torch_present):
                 torch_output[:, :in_len // 2, :].to(
                     torch.float32).cpu().numpy(),
                 atol=5e-3)
-            if attention_type == 'llama_attention':
+            if attention_type in ['llama_attention', 'gpt2_attention']:
                 verify_kv_cache(torch_present[0])
             else:
                 verify_kv_cache(torch_present)
@@ -1374,11 +1377,14 @@ def verify_kv_cache(torch_present):
 
             # torch execution
             if attention_type == 'gpt2_attention':
-                torch_output, torch_present = attention(
-                    input_tensor,
-                    layer_past=torch_present,
-                    use_cache=True,
-                    attention_mask=attention_mask)
+                # gpt2 uses DynamicCache
+                torch_present = DynamicCache.from_legacy_cache(
+                    torch_present)
+                torch_output = attention(input_tensor,
+                                         past_key_value=torch_present,
+                                         use_cache=True,
+                                         attention_mask=attention_mask)[0]
+                torch_present = torch_present.to_legacy_cache()
             elif attention_type == 'llama_attention':
                 position_embeddings = rotary_emb(input_tensor, position_ids)
                 attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask(
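
The test updates follow the transformers 4.53 change where GPT2Attention takes a Cache object via past_key_value instead of the legacy layer_past tuples, as the diff above shows. A minimal sketch of the round trip between the legacy tuple format and DynamicCache (shapes are illustrative):

import torch
from transformers.cache_utils import DynamicCache

# Legacy format: one (key, value) tuple per layer, each of shape
# [batch, num_heads, seq_len, head_dim].
legacy = ((torch.rand(1, 12, 4, 64), torch.rand(1, 12, 4, 64)),)

cache = DynamicCache.from_legacy_cache(legacy)  # wrap for the new API
# ... pass `cache` to the attention module via past_key_value=...;
# the module appends new keys/values to it in place ...
legacy_again = cache.to_legacy_cache()          # back to tuples for assertions
assert torch.equal(legacy_again[0][0], legacy[0][0])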

tests/unittest/trt/attention/test_gpt_attention_IFB.py

Lines changed: 9 additions & 9 deletions
@@ -754,11 +754,11 @@ def torch_exec(step: int,
             dtype=tensorrt_llm._utils.str_dtype_to_torch(dtype),
             tgt_len=(in_len if step == 0 else 1))
         if attention_type == 'gpt2_attention':
-            torch_output, torch_present = attention(
-                input,
-                layer_past=layer_past,
-                use_cache=True,
-                attention_mask=attention_mask)
+            torch_output = attention(input,
+                                     past_key_value=layer_past,
+                                     use_cache=True,
+                                     attention_mask=attention_mask)[0]
+            torch_present = layer_past
         elif attention_type == 'llama_attention':
             position_embeddings = rotary_emb(input, position_ids)
             attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask(
@@ -1003,8 +1003,8 @@ def torch_exec(step: int,
         torch_in = input_tensor[:, offset:offset_next, :].reshape(
             (local_beam_width, input_length, hidden_size))
 
-        # llama uses DynamicCache
-        if attention_type == 'llama_attention':
+        # llama/gpt2 uses DynamicCache
+        if attention_type in ['llama_attention', 'gpt2_attention']:
             past_key_values = DynamicCache.from_legacy_cache(
                 torch_cache_list[req_idx])
         else:
@@ -1014,8 +1014,8 @@ def torch_exec(step: int,
             step, torch_in, ctx_attention_mask_list[req_idx], req_idx,
             past_key_values)
 
-        # llama uses DynamicCache
-        if attention_type == 'llama_attention':
+        # llama/gpt2 uses DynamicCache
+        if attention_type in ['llama_attention', 'gpt2_attention']:
             torch_cache_list[req_idx] = past_key_values.to_legacy_cache(
             )
             past_key_values = torch_cache_list[req_idx][0]
