
Commit e6fdedc

Fix Qwen2-VL and Qwen2.5-VL config attribute access for Transformers v5 (#1062)
Fixes #1012

⚠️ Dependency: This PR depends on #1060. Please review and merge #1060 first.

## Summary

- Fix `AttributeError: 'Qwen2VLConfig' object has no attribute 'hidden_size'` for Qwen2-VL and Qwen2.5-VL models
- Update test configurations to use the new `text_config` structure required by Transformers v5

## Changes

1. **Model code** (`src/liger_kernel/transformers/model/qwen2_vl.py`, `qwen2_5_vl.py`):
   - Replaced direct `self.config.hidden_size` / `self.config.vocab_size` access with version-aware helpers (`_get_hidden_size`, `_get_vocab_size`) that read from `self.config.text_config` on Transformers v5 and fall back to the top-level attributes on earlier versions
2. **Test configurations** (`test/convergence/bf16/test_mini_models.py`, `fp32/test_mini_models.py`):
   - Restructured the `mini_qwen2_vl` and `mini_qwen2_5_vl` configurations to place text-related parameters in a `text_config` dictionary

## Background

In Transformers v5, `Qwen2VLConfig` and `Qwen2_5_VLConfig` moved text-related parameters (such as `hidden_size` and `vocab_size`) into a nested `text_config` attribute, following the pattern used by other multimodal models.

## Test plan

- [x] `python -m pytest test/convergence/bf16/test_mini_models.py -k "qwen2_vl or qwen2_5_vl"` passes
- [x] `python -m pytest test/convergence/fp32/test_mini_models.py -k "qwen2_vl or qwen2_5_vl"` passes
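
For context, the attribute move this PR adapts to can be probed directly. The snippet below is a sketch only (not part of the PR); the `getattr` fallback is an assumption for pre-v5 configs that still expose `hidden_size` at the top level:

```python
# Sketch only, not from this PR: shows reading either Qwen2-VL config layout.
# Assumption: pre-v5 configs expose hidden_size/vocab_size at the top level,
# while v5 nests them under text_config.
from transformers import Qwen2VLConfig


def read_text_attr(config, name: str):
    """Read a text-model attribute from either config layout."""
    text_config = getattr(config, "text_config", None)
    if text_config is not None and hasattr(text_config, name):
        return getattr(text_config, name)
    return getattr(config, name)


cfg = Qwen2VLConfig()  # default values, used only for illustration
print(read_text_attr(cfg, "hidden_size"), read_text_attr(cfg, "vocab_size"))
```
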
1 parent 2d23852 commit e6fdedc

File tree

4 files changed: +144, -104 lines


src/liger_kernel/transformers/model/qwen2_5_vl.py

Lines changed: 20 additions & 2 deletions
```diff
@@ -5,12 +5,30 @@
 
 import torch
 
+from packaging import version
+from transformers import __version__ as transformers_version
 from transformers.utils import can_return_tuple
 
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
 from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
 from liger_kernel.transformers.model.output_classes import LigerQwen2_5_VLCausalLMOutputWithPast
 
+_TRANSFORMERS_V5_OR_LATER = version.parse(transformers_version) >= version.parse("5.0.0")
+
+
+def _get_hidden_size(config) -> int:
+    """Get hidden_size from Qwen2.5VLConfig in a version-aware manner."""
+    if _TRANSFORMERS_V5_OR_LATER:
+        return config.text_config.hidden_size
+    return config.hidden_size
+
+
+def _get_vocab_size(config) -> int:
+    """Get vocab_size from Qwen2.5VLConfig in a version-aware manner."""
+    if _TRANSFORMERS_V5_OR_LATER:
+        return config.text_config.vocab_size
+    return config.vocab_size
+
 
 @can_return_tuple
 def lce_forward(
@@ -129,7 +147,7 @@ def lce_forward(
             lm_head_weight=self.lm_head.weight,
             labels=labels,
             shift_labels=shift_labels,
-            hidden_size=self.config.hidden_size,
+            hidden_size=_get_hidden_size(self.config),
             **kwargs,
         )
         loss, _, token_accuracy = unpack_cross_entropy_result(result)
@@ -142,7 +160,7 @@ def lce_forward(
             logits=logits,
             labels=labels,
             shift_labels=shift_labels,
-            vocab_size=self.config.vocab_size,
+            vocab_size=_get_vocab_size(self.config),
         )
 
         if not return_dict:
```

src/liger_kernel/transformers/model/qwen2_vl.py

Lines changed: 20 additions & 2 deletions
```diff
@@ -5,12 +5,30 @@
 
 import torch
 
+from packaging import version
+from transformers import __version__ as transformers_version
 from transformers.utils import can_return_tuple
 
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
 from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
 from liger_kernel.transformers.model.output_classes import LigerQwen2VLCausalLMOutputWithPast
 
+_TRANSFORMERS_V5_OR_LATER = version.parse(transformers_version) >= version.parse("5.0.0")
+
+
+def _get_hidden_size(config) -> int:
+    """Get hidden_size from Qwen2VLConfig in a version-aware manner."""
+    if _TRANSFORMERS_V5_OR_LATER:
+        return config.text_config.hidden_size
+    return config.hidden_size
+
+
+def _get_vocab_size(config) -> int:
+    """Get vocab_size from Qwen2VLConfig in a version-aware manner."""
+    if _TRANSFORMERS_V5_OR_LATER:
+        return config.text_config.vocab_size
+    return config.vocab_size
+
 
 @can_return_tuple
 def lce_forward(
@@ -125,7 +143,7 @@ def lce_forward(
             lm_head_weight=self.lm_head.weight,
             labels=labels,
             shift_labels=shift_labels,
-            hidden_size=self.config.hidden_size,
+            hidden_size=_get_hidden_size(self.config),
             **kwargs,
         )
         loss, _, token_accuracy = unpack_cross_entropy_result(result)
@@ -138,7 +156,7 @@ def lce_forward(
             logits=logits,
             labels=labels,
             shift_labels=shift_labels,
-            vocab_size=self.config.vocab_size,
+            vocab_size=_get_vocab_size(self.config),
         )
 
         if not return_dict:
```
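
The version gate added to both model files above is identical; it relies only on `packaging`, which transformers already requires. A standalone copy for quick experimentation (a sketch; the `print` is added here for illustration and is not in the diff):

```python
# Standalone copy of the version gate used in both model files; print is illustrative.
from packaging import version
from transformers import __version__ as transformers_version

_TRANSFORMERS_V5_OR_LATER = version.parse(transformers_version) >= version.parse("5.0.0")
print(f"transformers {transformers_version}: v5 or later = {_TRANSFORMERS_V5_OR_LATER}")
```
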

test/convergence/bf16/test_mini_models.py

Lines changed: 52 additions & 50 deletions
```diff
@@ -711,34 +711,36 @@
         liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_vl,
         model_class=Qwen2VLForConditionalGeneration,
         mini_model_config=Qwen2VLConfig(
-            attention_dropout=0.0,
-            # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset
-            # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
-            bos_token_id=1,  # 151643
-            eos_token_id=2,  # 151645
+            # In transformers v5, text-related parameters must be in text_config
+            text_config={
+                "attention_dropout": 0.0,
+                # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset
+                # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+                "bos_token_id": 1,  # 151643
+                "eos_token_id": 2,  # 151645
+                "hidden_act": "silu",
+                "hidden_size": 1536,  # 8192
+                "initializer_range": 0.02,
+                "intermediate_size": 4864,  # 29568
+                "max_position_embeddings": 32768,
+                "max_window_layers": 4,  # 80
+                "num_attention_heads": 12,  # 64
+                "num_hidden_layers": 4,  # 80
+                "num_key_value_heads": 2,  # 8
+                "rms_norm_eps": 1e-6,  # 1e-5
+                "rope_parameters": {
+                    "mrope_section": [16, 24, 24],  # (temporal, height, width)
+                },
+                "sliding_window": 4096,
+                "tie_word_embeddings": False,
+                "use_cache": True,
+                "vocab_size": 32768,  # 152064 # >32k, Mistral-7B tokenizer vocab size
+                "use_sliding_window": False,
+            },
             vision_start_token_id=32765,  # vocab_size - 5
             vision_end_token_id=32766,  # vocab_size - 4
-            vision_token_id=32767,  # vocab_size - 3
             image_token_id=32768,  # vocab_size - 2
             video_token_id=32769,  # vocab_size - 1
-            hidden_act="silu",
-            hidden_size=1536,  # 8192
-            initializer_range=0.02,
-            intermediate_size=4864,  # 29568
-            max_position_embeddings=32768,
-            max_window_layers=4,  # 80
-            num_attention_heads=12,  # 64
-            num_hidden_layers=4,  # 80
-            num_key_value_heads=2,  # 8
-            rms_norm_eps=1e-6,  # 1e-5
-            rope_parameters=dict(
-                mrope_section=[16, 24, 24],  # (temporal, height, width)
-            ),
-            sliding_window=4096,
-            tie_word_embeddings=False,
-            use_cache=True,
-            vocab_size=32768,  # 152064 # >32k, Mistral-7B tokenizer vocab size
-            use_sliding_window=False,
             vision_config={
                 "depth": 4,  # 32
                 "embed_dim": 1280,
@@ -751,7 +753,6 @@
                 "spatial_patch_size": 14,
                 "temporal_patch_size": 2,
             },
-            attn_implementation="sdpa",
         ),
     )
 
@@ -761,34 +762,36 @@
         liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_5_vl,
         model_class=Qwen2_5_VLForConditionalGeneration,
         mini_model_config=Qwen2_5_VLConfig(
-            attention_dropout=0.0,
-            # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset
-            # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
-            bos_token_id=1,  # 151643
-            eos_token_id=2,  # 151645
+            # In transformers v5, text-related parameters must be in text_config
+            text_config={
+                "attention_dropout": 0.0,
+                # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset
+                # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+                "bos_token_id": 1,  # 151643
+                "eos_token_id": 2,  # 151645
+                "hidden_act": "silu",
+                "hidden_size": 1536,  # 8192
+                "initializer_range": 0.02,
+                "intermediate_size": 4864,  # 29568
+                "max_position_embeddings": 32768,
+                "max_window_layers": 4,  # 80
+                "num_attention_heads": 12,  # 64
+                "num_hidden_layers": 4,  # 80
+                "num_key_value_heads": 2,  # 8
+                "rms_norm_eps": 1e-6,  # 1e-5
+                "rope_parameters": {
+                    "mrope_section": [16, 24, 24],  # (temporal, height, width)
+                },
+                "sliding_window": 4096,
+                "tie_word_embeddings": False,
+                "use_cache": True,
+                "vocab_size": 32768,  # 152064 # >32k, Mistral-7B tokenizer vocab size
+                "use_sliding_window": False,
+            },
             vision_start_token_id=32765,  # vocab_size - 5
             vision_end_token_id=32766,  # vocab_size - 4
-            vision_token_id=32767,  # vocab_size - 3
             image_token_id=32768,  # vocab_size - 2
             video_token_id=32769,  # vocab_size - 1
-            hidden_act="silu",
-            hidden_size=1536,  # 8192
-            initializer_range=0.02,
-            intermediate_size=4864,  # 29568
-            max_position_embeddings=32768,
-            max_window_layers=4,  # 80
-            num_attention_heads=12,  # 64
-            num_hidden_layers=4,  # 80
-            num_key_value_heads=2,  # 8
-            rms_norm_eps=1e-6,  # 1e-5
-            rope_parameters=dict(
-                mrope_section=[16, 24, 24],  # (temporal, height, width)
-            ),
-            sliding_window=4096,
-            tie_word_embeddings=False,
-            use_cache=True,
-            vocab_size=32768,  # 152064 # >32k, Mistral-7B tokenizer vocab size
-            use_sliding_window=False,
             vision_config={
                 "depth": 4,  # 32
                 "hidden_act": "silu",
@@ -805,7 +808,6 @@
                 "tokens_per_second": 2,
                 "temporal_patch_size": 2,
             },
-            attn_implementation="sdpa",
         ),
     )
 
```
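
The restructured test configs above pass `text_config` as a plain dict, which the config class is expected to convert into a nested text config. A minimal check of that behavior (a sketch assuming the Transformers v5 layout described in the PR; not code from the diff):

```python
# Sketch assuming the Transformers v5 layout described above; not part of the diff.
from transformers import Qwen2VLConfig

cfg = Qwen2VLConfig(
    text_config={"hidden_size": 1536, "vocab_size": 32768, "num_hidden_layers": 4},
)
print(type(cfg.text_config).__name__)  # nested text-config class
print(cfg.text_config.hidden_size)     # 1536
print(cfg.text_config.vocab_size)      # 32768
```
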

test/convergence/fp32/test_mini_models.py

Lines changed: 52 additions & 50 deletions
```diff
@@ -708,34 +708,36 @@
         liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_vl,
         model_class=Qwen2VLForConditionalGeneration,
         mini_model_config=Qwen2VLConfig(
-            attention_dropout=0.0,
-            # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset
-            # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
-            bos_token_id=1,  # 151643
-            eos_token_id=2,  # 151645
+            # In transformers v5, text-related parameters must be in text_config
+            text_config={
+                "attention_dropout": 0.0,
+                # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset
+                # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+                "bos_token_id": 1,  # 151643
+                "eos_token_id": 2,  # 151645
+                "hidden_act": "silu",
+                "hidden_size": 1536,  # 8192
+                "initializer_range": 0.02,
+                "intermediate_size": 4864,  # 29568
+                "max_position_embeddings": 32768,
+                "max_window_layers": 4,  # 80
+                "num_attention_heads": 12,  # 64
+                "num_hidden_layers": 4,  # 80
+                "num_key_value_heads": 2,  # 8
+                "rms_norm_eps": 1e-6,  # 1e-5
+                "rope_parameters": {
+                    "mrope_section": [16, 24, 24],  # (temporal, height, width)
+                },
+                "sliding_window": 4096,
+                "tie_word_embeddings": False,
+                "use_cache": True,
+                "vocab_size": 32768,  # 152064 # >32k, Mistral-7B tokenizer vocab size
+                "use_sliding_window": False,
+            },
             vision_start_token_id=32765,  # vocab_size - 5
             vision_end_token_id=32766,  # vocab_size - 4
-            vision_token_id=32767,  # vocab_size - 3
             image_token_id=32768,  # vocab_size - 2
             video_token_id=32769,  # vocab_size - 1
-            hidden_act="silu",
-            hidden_size=1536,  # 8192
-            initializer_range=0.02,
-            intermediate_size=4864,  # 29568
-            max_position_embeddings=32768,
-            max_window_layers=4,  # 80
-            num_attention_heads=12,  # 64
-            num_hidden_layers=4,  # 80
-            num_key_value_heads=2,  # 8
-            rms_norm_eps=1e-6,  # 1e-5
-            rope_parameters=dict(
-                mrope_section=[16, 24, 24],  # (temporal, height, width)
-            ),
-            sliding_window=4096,
-            tie_word_embeddings=False,
-            use_cache=True,
-            vocab_size=32768,  # 152064 # >32k, Mistral-7B tokenizer vocab size
-            use_sliding_window=False,
             vision_config={
                 "depth": 4,  # 32
                 "embed_dim": 1280,
@@ -748,7 +750,6 @@
                 "spatial_patch_size": 14,
                 "temporal_patch_size": 2,
             },
-            attn_implementation="sdpa",
         ),
     )
 
@@ -758,34 +759,36 @@
         liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_5_vl,
         model_class=Qwen2_5_VLForConditionalGeneration,
         mini_model_config=Qwen2_5_VLConfig(
-            attention_dropout=0.0,
-            # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset
-            # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
-            bos_token_id=1,  # 151643
-            eos_token_id=2,  # 151645
+            # In transformers v5, text-related parameters must be in text_config
+            text_config={
+                "attention_dropout": 0.0,
+                # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset
+                # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+                "bos_token_id": 1,  # 151643
+                "eos_token_id": 2,  # 151645
+                "hidden_act": "silu",
+                "hidden_size": 1536,  # 8192
+                "initializer_range": 0.02,
+                "intermediate_size": 4864,  # 29568
+                "max_position_embeddings": 32768,
+                "max_window_layers": 4,  # 80
+                "num_attention_heads": 12,  # 64
+                "num_hidden_layers": 4,  # 80
+                "num_key_value_heads": 2,  # 8
+                "rms_norm_eps": 1e-6,  # 1e-5
+                "rope_parameters": {
+                    "mrope_section": [16, 24, 24],  # (temporal, height, width)
+                },
+                "sliding_window": 4096,
+                "tie_word_embeddings": False,
+                "use_cache": True,
+                "vocab_size": 32768,  # 152064 # >32k, Mistral-7B tokenizer vocab size
+                "use_sliding_window": False,
+            },
             vision_start_token_id=32765,  # vocab_size - 5
             vision_end_token_id=32766,  # vocab_size - 4
-            vision_token_id=32767,  # vocab_size - 3
             image_token_id=32768,  # vocab_size - 2
             video_token_id=32769,  # vocab_size - 1
-            hidden_act="silu",
-            hidden_size=1536,  # 8192
-            initializer_range=0.02,
-            intermediate_size=4864,  # 29568
-            max_position_embeddings=32768,
-            max_window_layers=4,  # 80
-            num_attention_heads=12,  # 64
-            num_hidden_layers=4,  # 80
-            num_key_value_heads=2,  # 8
-            rms_norm_eps=1e-6,  # 1e-5
-            rope_parameters=dict(
-                mrope_section=[16, 24, 24],  # (temporal, height, width)
-            ),
-            sliding_window=4096,
-            tie_word_embeddings=False,
-            use_cache=True,
-            vocab_size=32768,  # 152064 # >32k, Mistral-7B tokenizer vocab size
-            use_sliding_window=False,
             vision_config={
                 "depth": 4,  # 32
                 "hidden_act": "silu",
@@ -802,7 +805,6 @@
                 "tokens_per_second": 2,
                 "temporal_patch_size": 2,
             },
-            attn_implementation="sdpa",
         ),
     )
 
```
