
Commit 7b20915

Authored by zRzRzRzRzRzRzR, zucchini-nlp, and ArthurZucker
GLM-4.5V Model Support (#39805)
* init
* update
* update
* ruff
* t patch is 2 default, not 1
* draft
* back
* back1
* update
* config update
* update using glm-41 format
* add self.rope_scaling = config.rope_scaling
* update config
* update
* remove the processor
* update
* fix tests
* update
* for test
* update
* update 2126
* self.rope_scaling is missing in GLM4MOE, let's add it
* update
* update
* Update modular_glm4v_moe.py
* change config
* update apply_multimodal_rotary_pos_emb
* format
* update
* Delete 3-rollout_qas_thinking_answers.py
* use right name
* update with placeholder
* update
* use right rotary
* Update image_processing_glm4v_fast.py
* rope_config_validation needs to rewrite the entire config file in modular
* update
* changed name
* update
* Update modeling_glm4v_moe.py
* _init_weights should be added in Glm4vMoePreTrainedModel
* remove use_qk_norm
* Update modular_glm4v_moe.py
* remove use_qk_norm as it is not used
* fix style
* deprecations are not needed on new models
* fix merge issues

---------

Co-authored-by: raushan <[email protected]>
Co-authored-by: Arthur <[email protected]>
Co-authored-by: Arthur <[email protected]>
1 parent d2ba153 · commit 7b20915

20 files changed, with 3635 additions and 448 deletions.
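Taken together, the changes below register a new `glm4v_moe` model type across the auto classes, add a documentation stub, and rework the GLM-4V fast image processor. As orientation, here is a minimal usage sketch of what the new classes enable; the checkpoint id is a placeholder assumption (not taken from this diff), and the chat-template call follows the standard multimodal processor API:

```python
# Hedged sketch: load GLM-4.5V through the newly registered classes.
# "zai-org/GLM-4.5V" is an assumed/placeholder repo id; substitute the released checkpoint.
from transformers import AutoProcessor, Glm4vMoeForConditionalGeneration

checkpoint = "zai-org/GLM-4.5V"
processor = AutoProcessor.from_pretrained(checkpoint)  # resolves to Glm4vProcessor (see processing_auto.py below)
model = Glm4vMoeForConditionalGeneration.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device)

output = model.generate(**inputs, max_new_tokens=64)
print(processor.batch_decode(output[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0])
```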

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
```diff
@@ -1009,6 +1009,8 @@
         title: GIT
       - local: model_doc/glm4v
         title: glm4v
+      - local: model_doc/glm4v_moe
+        title: glm4v_moe
       - local: model_doc/got_ocr2
         title: GOT-OCR2
       - local: model_doc/granitevision
```

docs/source/en/model_doc/glm4v_moe.md

Lines changed: 64 additions & 0 deletions
```diff
@@ -0,0 +1,64 @@
+<!--Copyright 2025 The ZhipuAI Inc. and The HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white"> </div>
+</div>
+
+# Glm4vMoe
+
+## Overview
+
+The Glm4vMoe model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+<INSERT SHORT SUMMARY HERE>
+
+The abstract from the paper is the following:
+
+*<INSERT PAPER ABSTRACT HERE>*
+
+Tips:
+
+<INSERT TIPS ABOUT MODEL HERE>
+
+This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
+The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
+
+
+## Glm4vMoeConfig
+
+[[autodoc]] Glm4vMoeConfig
+
+## Glm4vMoeTextConfig
+
+[[autodoc]] Glm4vMoeTextConfig
+
+## Glm4vMoeTextModel
+
+[[autodoc]] Glm4vMoeTextModel
+    - forward
+
+## Glm4vMoeModel
+
+[[autodoc]] Glm4vMoeModel
+    - forward
+
+## Glm4vMoeForConditionalGeneration
+
+[[autodoc]] Glm4vMoeForConditionalGeneration
+    - forward
```
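The stub above only lists the public classes. A hedged sketch of how they are expected to fit together, assuming the composite config mirrors `glm4v` (the `text_config` attribute name is an assumption, not confirmed by this diff):

```python
# Hedged sketch: relationship between the documented config classes.
# The `text_config` attribute is assumed to mirror Glm4vConfig's layout.
from transformers import Glm4vMoeConfig, Glm4vMoeTextConfig

config = Glm4vMoeConfig()                 # composite vision+text config, model_type "glm4v_moe"
print(config.model_type)
print(type(config.text_config).__name__)  # expected: Glm4vMoeTextConfig (model_type "glm4v_moe_text")
assert isinstance(config.text_config, Glm4vMoeTextConfig)
```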

src/transformers/models/auto/configuration_auto.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -163,6 +163,8 @@
         ("glm4", "Glm4Config"),
         ("glm4_moe", "Glm4MoeConfig"),
         ("glm4v", "Glm4vConfig"),
+        ("glm4v_moe", "Glm4vMoeConfig"),
+        ("glm4v_moe_text", "Glm4vMoeTextConfig"),
         ("glm4v_text", "Glm4vTextConfig"),
         ("glpn", "GLPNConfig"),
         ("got_ocr2", "GotOcr2Config"),
@@ -569,6 +571,8 @@
         ("glm4", "GLM4"),
         ("glm4_moe", "Glm4MoE"),
         ("glm4v", "GLM4V"),
+        ("glm4v_moe", "GLM4VMOE"),
+        ("glm4v_moe_text", "GLM4VMOE"),
         ("glm4v_text", "GLM4V"),
         ("glpn", "GLPN"),
         ("got_ocr2", "GOT-OCR2"),
@@ -900,6 +904,7 @@
         ("gemma3n_text", "gemma3n"),
         ("gemma3n_vision", "gemma3n"),
         ("glm4v_text", "glm4v"),
+        ("glm4v_moe_text", "glm4v_moe"),
         ("idefics3_vision", "idefics3"),
         ("siglip_vision_model", "siglip"),
         ("aimv2_vision_model", "aimv2"),
```

src/transformers/models/auto/modeling_auto.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -165,6 +165,8 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("glm4", "Glm4Model"),
         ("glm4_moe", "Glm4MoeModel"),
         ("glm4v", "Glm4vModel"),
+        ("glm4v_moe", "Glm4vMoeModel"),
+        ("glm4v_moe_text", "Glm4vMoeTextModel"),
         ("glm4v_text", "Glm4vTextModel"),
         ("glpn", "GLPNModel"),
         ("got_ocr2", "GotOcr2Model"),
@@ -970,6 +972,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("gemma3n", "Gemma3nForConditionalGeneration"),
         ("git", "GitForCausalLM"),
         ("glm4v", "Glm4vForConditionalGeneration"),
+        ("glm4v_moe", "Glm4vMoeForConditionalGeneration"),
         ("got_ocr2", "GotOcr2ForConditionalGeneration"),
         ("idefics", "IdeficsForVisionText2Text"),
         ("idefics2", "Idefics2ForConditionalGeneration"),
```

src/transformers/models/auto/processing_auto.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -74,6 +74,7 @@
         ("gemma3n", "Gemma3nProcessor"),
         ("git", "GitProcessor"),
         ("glm4v", "Glm4vProcessor"),
+        ("glm4v_moe", "Glm4vProcessor"),
         ("got_ocr2", "GotOcr2Processor"),
         ("granite_speech", "GraniteSpeechProcessor"),
         ("grounding-dino", "GroundingDinoProcessor"),
```

src/transformers/models/auto/tokenization_auto.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -294,6 +294,7 @@
         ("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
         ("glm4_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
         ("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
+        ("glm4v_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
         ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
         ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
         ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
```

src/transformers/models/glm4_moe/modeling_glm4_moe.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -135,6 +135,7 @@ def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None):
         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
         self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.scaling = self.head_dim**-0.5
+        self.rope_scaling = config.rope_scaling
         self.attention_dropout = config.attention_dropout
         self.is_causal = True
 
```
src/transformers/models/glm4_moe/modular_glm4_moe.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -263,6 +263,7 @@ def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None):
         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
         self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.scaling = self.head_dim**-0.5
+        self.rope_scaling = config.rope_scaling
         self.attention_dropout = config.attention_dropout
         self.is_causal = True
 
```
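Both files make the same one-line change: the attention module keeps a reference to `config.rope_scaling` (matching the commit message's "self.rope_scaling is missing in GLM4MOE, let's add it"). A standalone illustration of the kind of dict involved and why a module-level reference is convenient; the field values below are assumptions for illustration, not taken from a shipped config:

```python
# Illustrative stub, not library code: rope_scaling is typically None or a dict, and storing it on the
# attention module lets rotary helpers (e.g. a multimodal rope) read fields such as "mrope_section"
# without reaching back to the config object. The values below are assumed examples.
rope_scaling = {"rope_type": "default", "mrope_section": [8, 12, 12]}

class AttentionStub:
    def __init__(self, rope_scaling):
        # mirrors the added line: self.rope_scaling = config.rope_scaling
        self.rope_scaling = rope_scaling

attn = AttentionStub(rope_scaling)
sections = attn.rope_scaling.get("mrope_section") if attn.rope_scaling else None
print(sections)  # [8, 12, 12]
```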

src/transformers/models/glm4v/configuration_glm4v.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -94,7 +94,7 @@ def __init__(
         patch_size=14,
         rms_norm_eps=1e-05,
         spatial_merge_size=2,
-        temporal_patch_size=1,
+        temporal_patch_size=2,
         out_hidden_size=4096,
         intermediate_size=13696,
         initializer_range=0.02,
```
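Changing the default `temporal_patch_size` from 1 to 2 means each patch spans two frames, and a single still image has its frame repeated so the temporal axis becomes a multiple of 2 (the fast image processor below performs exactly this padding). A short arithmetic check of the resulting temporal grid:

```python
# Sketch of the temporal padding arithmetic implied by temporal_patch_size=2.
import math

temporal_patch_size = 2
num_frames = 1                                                      # a single still image
padded = math.ceil(num_frames / temporal_patch_size) * temporal_patch_size
grid_t = padded // temporal_patch_size
print(padded, grid_t)                                               # 2 1  (frame duplicated, one temporal patch)
```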

src/transformers/models/glm4v/image_processing_glm4v_fast.py

Lines changed: 36 additions & 47 deletions
```diff
@@ -22,8 +22,6 @@
 from ...image_processing_utils_fast import (
     BaseImageProcessorFast,
     DefaultFastImageProcessorKwargs,
-    group_images_by_shape,
-    reorder_images,
 )
 from ...image_utils import (
     OPENAI_CLIP_MEAN,
@@ -47,7 +45,6 @@
 if is_torch_available():
     import torch
 
-
 if is_torchvision_available():
     if is_torchvision_v2_available():
         from torchvision.transforms.v2 import functional as F
@@ -112,48 +109,44 @@ def _preprocess(
         Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
         """
 
-        # Group images by size for batched resizing
-        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
-        resized_images_grouped = {}
-        for shape, stacked_images in grouped_images.items():
-            height, width = stacked_images.shape[-2:]
+        processed_images = []
+        processed_grids = []
+
+        all_target_sizes = []
+        for image in images:
+            height, width = image.shape[-2:]
+            resized_height, resized_width = smart_resize(
+                num_frames=temporal_patch_size,
+                height=height,
+                width=width,
+                temporal_factor=temporal_patch_size,
+                factor=patch_size * merge_size,
+            )
+            all_target_sizes.append((resized_height, resized_width))
+
+        target_height = max([s[0] for s in all_target_sizes])
+        target_width = max([s[1] for s in all_target_sizes])
+
+        for image in images:
             if do_resize:
-                resized_height, resized_width = smart_resize(
-                    num_frames=temporal_patch_size,
-                    height=height,
-                    width=width,
-                    temporal_factor=temporal_patch_size,
-                    factor=patch_size * merge_size,
-                )
-                stacked_images = self.resize(
-                    stacked_images,
-                    size=SizeDict(height=resized_height, width=resized_width),
+                image = self.resize(
+                    image,
+                    size=SizeDict(height=target_height, width=target_width),
                     interpolation=interpolation,
                 )
-            resized_images_grouped[shape] = stacked_images
-        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
-        # Group images by size for further processing
-        # Needed in case do_resize is False, or resize returns images with different sizes
-        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
-        processed_images_grouped = {}
-        processed_grids = {}
-        for shape, stacked_images in grouped_images.items():
-            resized_height, resized_width = stacked_images.shape[-2:]
-            # Fused rescale and normalize
-            stacked_images = self.rescale_and_normalize(
-                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
-            )
-            # add a temporal dimension
-            patches = stacked_images.unsqueeze(1)
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
-            batch_size, grid_t, channel = patches.shape[:3]
-            grid_t = grid_t // temporal_patch_size
-            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
 
+            image = self.rescale_and_normalize(
+                image.unsqueeze(0), do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            ).squeeze(0)
+
+            patches = image.unsqueeze(0)
+            if patches.shape[0] % temporal_patch_size != 0:
+                repeats = patches[-1:].repeat(temporal_patch_size - (patches.shape[0] % temporal_patch_size), 1, 1, 1)
+                patches = torch.cat([patches, repeats], dim=0)
+            channel = patches.shape[1]
+            grid_t = patches.shape[0] // temporal_patch_size
+            grid_h, grid_w = target_height // patch_size, target_width // patch_size
             patches = patches.view(
-                batch_size,
                 grid_t,
                 temporal_patch_size,
                 channel,
@@ -164,18 +157,14 @@ def _preprocess(
                 merge_size,
                 patch_size,
             )
-            patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
+            patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
             flatten_patches = patches.reshape(
-                batch_size,
                 grid_t * grid_h * grid_w,
                 channel * temporal_patch_size * patch_size * patch_size,
            )
+            processed_images.append(flatten_patches)
+            processed_grids.append([grid_t, grid_h, grid_w])
 
-            processed_images_grouped[shape] = flatten_patches
-            processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
-
-        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_grids = reorder_images(processed_grids, grouped_images_index)
         pixel_values = torch.stack(processed_images, dim=0)
         image_grid_thw = torch.tensor(processed_grids)
 
```
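The rewrite drops the `group_images_by_shape`/`reorder_images` batching helpers: every image is resized to a shared maximum target size (so the final `torch.stack` is valid), rescaled and normalized individually, padded along the temporal axis, and flattened into patches. Below is a standalone sketch of that patching math in plain `torch`, using assumed sizes (`patch_size=14`, `merge_size=2`, `temporal_patch_size=2`); it is not the library code itself.

```python
# Standalone sketch of the per-image patch flattening performed above; assumed sizes, not library code.
import torch

patch_size, merge_size, temporal_patch_size = 14, 2, 2   # assumed GLM-4V-style defaults
image = torch.rand(3, 336, 336)                          # one already-resized, normalized C x H x W image

patches = image.unsqueeze(0)                             # add a temporal axis: T x C x H x W
if patches.shape[0] % temporal_patch_size != 0:          # pad T to a multiple of temporal_patch_size
    repeats = patches[-1:].repeat(temporal_patch_size - (patches.shape[0] % temporal_patch_size), 1, 1, 1)
    patches = torch.cat([patches, repeats], dim=0)

channel = patches.shape[1]
grid_t = patches.shape[0] // temporal_patch_size
grid_h, grid_w = image.shape[-2] // patch_size, image.shape[-1] // patch_size

patches = patches.view(
    grid_t, temporal_patch_size, channel,
    grid_h // merge_size, merge_size, patch_size,
    grid_w // merge_size, merge_size, patch_size,
)
patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)     # same permutation as the new code path
flat = patches.reshape(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size)
print(flat.shape)                                        # torch.Size([576, 1176]) for a 336x336 input
```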
