
Commit c47c17b

Add Glm4V-MoE architecture

1 parent 78c8048

6 files changed: +129, -16 lines

examples/common.py
Lines changed: 1 addition & 1 deletion

@@ -75,4 +75,4 @@ def get_stop_conditions(prompt_format, tokenizer):
         case "mistral":
             return [tokenizer.eos_token_id]
         case "glmv":
-            return [tokenizer.eos_token_id, "</answer>"]
+            return [tokenizer.eos_token_id, "</answer>", "<|user|>"]
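Note for readers: the glmv stop list now mixes an integer token id with literal strings. Below is a minimal sketch of how such mixed stop conditions are commonly checked during streaming; the hit_stop_condition helper is hypothetical and not part of exllamav3's API.

    # Hypothetical helper: interpret a mixed list of stop conditions,
    # where ints are stop token ids and strs are stop strings.
    def hit_stop_condition(new_token_id, text_so_far, stop_conditions):
        for cond in stop_conditions:
            if isinstance(cond, int) and new_token_id == cond:
                return True   # stop token emitted (e.g. tokenizer.eos_token_id)
            if isinstance(cond, str) and cond in text_so_far:
                return True   # stop string reached (e.g. "</answer>" or "<|user|>")
        return False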

examples/multimodal.py
Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@
            model_dir = "/mnt/str/models/qwen3-vl-30b-a3b-instruct/exl3/5.00bpw"
        case "glm":
            prompt_format = "glmv"
-            model_dir = "/mnt/str/models/glm4.1v-9b-thinking/exl3/2.0bpw"
+            model_dir = "/mnt/str/models/glm4.5v/exl3/4.00bpw"
 
 images = [
     # Cat

exllamav3/architecture/architectures.py
Lines changed: 2 additions & 0 deletions

@@ -12,6 +12,7 @@
 from .glm4 import Glm4Model
 from .glm4_moe import Glm4MoeModel
 from .glm4v import Glm4VModel
+from .glm4v_moe import Glm4VMoeModel
 from .llama import LlamaModel
 from .mimo import MiMoModel
 from .minimax_m2 import MiniMaxM2Model
@@ -49,6 +50,7 @@
     Glm4Model,
     Glm4MoeModel,
     Glm4VModel,
+    Glm4VMoeModel,
     LlamaModel,
     MiMoModel,
     MiniMaxM2Model,
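For context, each config class carries an arch_string (the new file below uses "Glm4vMoeForConditionalGeneration") matching the "architectures" entry in a checkpoint's config.json, which is why adding Glm4VMoeModel to this list makes the new checkpoints loadable. The sketch below only illustrates that kind of lookup with a stand-in class; it is not exllamav3's actual registry code, which this diff does not show.

    # Hypothetical sketch of an arch-string registry.
    class FakeGlm4VMoeModel:
        arch_string = "Glm4vMoeForConditionalGeneration"   # value from Glm4VMoeConfig

    REGISTRY = {cls.arch_string: cls for cls in [FakeGlm4VMoeModel]}

    def resolve_model_class(arch_string: str):
        if arch_string not in REGISTRY:
            raise ValueError(f"unsupported architecture: {arch_string}")
        return REGISTRY[arch_string]

    assert resolve_model_class("Glm4vMoeForConditionalGeneration") is FakeGlm4VMoeModel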

exllamav3/architecture/glm4_moe.py
Lines changed: 18 additions & 13 deletions

@@ -7,6 +7,10 @@
 from ..modules import RMSNorm, Embedding, TransformerBlock, Attention, GatedMLP, Linear, BlockSparseMLP
 from ..modules.attn import prepare_for_attn
 
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from .glm4v_moe import Glm4VMoeConfig
+
 class Glm4MoeConfig(Config):
     arch_string = "Glm4MoeForCausalLM"
 
@@ -58,15 +62,16 @@ class Glm4MoeModel(Model):
 
     def __init__(
         self,
-        config: Glm4MoeConfig,
+        config: Glm4MoeConfig | Glm4VMoeConfig,
+        key_prefix: str = "model",
         **kwargs
     ):
         super().__init__(config, **kwargs)
 
         self.modules += [
             Embedding(
                 config = config,
-                key = "model.embed_tokens",
+                key = f"{key_prefix}.embed_tokens",
                 vocab_size = config.vocab_size,
                 hidden_size = config.hidden_size,
             )
@@ -77,15 +82,15 @@ def __init__(
         self.modules += [
             TransformerBlock(
                 config = config,
-                key = f"model.layers.{idx}",
+                key = f"{key_prefix}.layers.{idx}",
                 attn_norm = RMSNorm(
                     config = config,
-                    key = f"model.layers.{idx}.input_layernorm",
+                    key = f"{key_prefix}.layers.{idx}.input_layernorm",
                     rms_norm_eps = config.rms_norm_eps,
                 ),
                 attn = Attention(
                     config = config,
-                    key = f"model.layers.{idx}.self_attn",
+                    key = f"{key_prefix}.layers.{idx}.self_attn",
                     layer_idx = idx,
                     hidden_size = config.hidden_size,
                     head_dim = config.head_dim,
@@ -100,25 +105,25 @@ def __init__(
                     qmap = "block.attn",
                     q_norm = RMSNorm(
                         config = config,
-                        key = f"model.layers.{idx}.self_attn.q_norm",
+                        key = f"{key_prefix}.layers.{idx}.self_attn.q_norm",
                         rms_norm_eps = config.rms_norm_eps,
                     ) if config.use_qk_norm else None,
                     k_norm = RMSNorm(
                         config = config,
-                        key = f"model.layers.{idx}.self_attn.k_norm",
+                        key = f"{key_prefix}.layers.{idx}.self_attn.k_norm",
                         rms_norm_eps = config.rms_norm_eps,
                     ) if config.use_qk_norm else None,
                     out_dtype = torch.float
                 ),
                 mlp_norm = RMSNorm(
                     config = config,
-                    key = f"model.layers.{idx}.post_attention_layernorm",
+                    key = f"{key_prefix}.layers.{idx}.post_attention_layernorm",
                     rms_norm_eps = config.rms_norm_eps,
                 ),
                 mlp = (
                     GatedMLP(
                         config = config,
-                        key = f"model.layers.{idx}.mlp",
+                        key = f"{key_prefix}.layers.{idx}.mlp",
                         hidden_size = config.hidden_size,
                         intermediate_size = config.intermediate_size,
                         key_up = "up_proj",
@@ -131,7 +136,7 @@ def __init__(
                     if idx < config.first_k_dense_replace else
                     BlockSparseMLP(
                         config = config,
-                        key = f"model.layers.{idx}.mlp",
+                        key = f"{key_prefix}.layers.{idx}.mlp",
                         hidden_size = config.hidden_size,
                         intermediate_size = config.moe_intermediate_size,
                         num_experts = config.num_experts,
@@ -150,7 +155,7 @@ def __init__(
                         topk_group = 1,
                         shared_experts = GatedMLP(
                             config = config,
-                            key = f"model.layers.{idx}.mlp.shared_experts",
+                            key = f"{key_prefix}.layers.{idx}.mlp.shared_experts",
                             hidden_size = config.hidden_size,
                             intermediate_size = config.moe_intermediate_size * config.num_shared_experts,
                             key_up = "up_proj",
@@ -170,12 +175,12 @@ def __init__(
 
         head_alt_key = None
         if config.tie_word_embeddings and not self.config.stc.has_tensor("lm_head"):
-            head_alt_key = "model.embed_tokens"
+            head_alt_key = f"{key_prefix}.embed_tokens"
 
         self.modules += [
             RMSNorm(
                 config = config,
-                key = "model.norm",
+                key = f"{key_prefix}.norm",
                 rms_norm_eps = config.rms_norm_eps,
                 out_dtype = torch.half,
             ),
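The new key_prefix argument is what lets Glm4VMoeModel (see the new file below) reuse this text stack: with the default "model" the tensor keys match the text-only Glm4-MoE checkpoints, while the vision-language variant passes "model.language_model". A small illustration of the resulting key layout, based only on the f-strings in the diff above:

    # Illustration only: how key_prefix changes the tensor keys the modules load.
    def attn_key(key_prefix: str, idx: int) -> str:
        return f"{key_prefix}.layers.{idx}.self_attn"

    print(attn_key("model", 0))                 # Glm4MoeForCausalLM: model.layers.0.self_attn
    print(attn_key("model.language_model", 0))  # Glm4vMoeForConditionalGeneration: model.language_model.layers.0.self_attn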

exllamav3/architecture/glm4v.py
Lines changed: 5 additions & 1 deletion

@@ -28,6 +28,10 @@
 from PIL import Image
 import os, json
 
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from .glm4v_moe import Glm4VMoeConfig
+
 class Glm4VConfig(Config):
     arch_string = "Glm4vForConditionalGeneration"
 
@@ -262,7 +266,7 @@ def get_additional_compiled_tensors(config: Glm4VConfig) -> dict:
 
     def __init__(
         self,
-        config: Glm4VConfig,
+        config: Glm4VConfig | Glm4VMoeConfig,
         key_prefix = "model.visual",
         **kwargs
     ):
exllamav3/architecture/glm4v_moe.py (new file)
Lines changed: 102 additions & 0 deletions

@@ -0,0 +1,102 @@
+from __future__ import annotations
+from typing_extensions import override
+import numpy as np
+import torch
+import torch.nn.functional as F
+from ..model.config import Config, no_default
+from ..model.model import Model
+from ..util.rope import RopeStyle, position_embedding_grid_2d, RopeSettings
+from ..util.file import read_dict, no_value, no_default
+from ..util.vision import convert_to_rgb, normalize_image, smart_resize
+from ..modules import (
+    Module,
+    RMSNorm,
+    Embedding,
+    TransformerBlock,
+    Attention,
+    GatedMLP,
+    Linear,
+    Conv,
+    LayerNorm,
+    Glm4VPosEmbedding,
+    MLP
+)
+from ..modules.attn import prepare_for_attn
+from .glm4_moe import Glm4MoeModel
+from types import SimpleNamespace
+from ..tokenizer import Tokenizer, MMEmbedding
+from PIL import Image
+import os, json
+from .glm4v import read_glm4v_vision_config, read_glm4v_pp_config, Glm4VVisionModel
+
+class Glm4VMoeConfig(Config):
+    arch_string = "Glm4vMoeForConditionalGeneration"
+
+    def __init__(
+        self,
+        directory: str,
+        **kwargs,
+    ):
+        super().__init__(
+            directory,
+            {"text": Glm4VMoeModel, "vision": Glm4VVisionModel},
+            **kwargs
+        )
+
+        # Attention params
+        self.head_dim = self.read_cfg(int, "text_config->head_dim", None)
+        self.hidden_size = self.read_cfg(int, "text_config->hidden_size", no_default)
+        self.num_q_heads = self.read_cfg(int, "text_config->num_attention_heads", no_default)
+        self.num_kv_heads = self.read_cfg(int, "text_config->num_key_value_heads", self.num_q_heads)
+        self.use_qk_norm = self.read_cfg(bool, "text_config->use_qk_norm", False)
+
+        if not self.head_dim:
+            self.head_dim = self.hidden_size // self.num_q_heads
+
+        # MLP params
+        self.assert_cfg(str, "text_config->hidden_act", "silu", True)
+        self.assert_cfg(bool, "text_config->norm_topk_prob", True, True)
+        self.intermediate_size = self.read_cfg(int, "text_config->intermediate_size", no_default)
+        self.moe_intermediate_size = self.read_cfg(int, "text_config->moe_intermediate_size", no_default)
+        self.num_shared_experts = self.read_cfg(int, "text_config->n_shared_experts", 1)
+        self.num_experts = self.read_cfg(int, "text_config->n_routed_experts", 128)
+        self.num_experts_per_tok = self.read_cfg(int, "text_config->num_experts_per_tok", 8)
+        self.first_k_dense_replace = self.read_cfg(int, "text_config->first_k_dense_replace", 3)
+        self.routed_scaling_factor = self.read_cfg(float, "text_config->routed_scaling_factor", 2.5)
+
+        # Norms
+        self.rms_norm_eps = self.read_cfg(float, "text_config->rms_norm_eps", no_default)
+
+        # Layers
+        self.num_hidden_layers = self.read_cfg(int, "text_config->num_hidden_layers", no_default)
+        self.tie_word_embeddings = self.read_cfg(bool, "tie_word_embeddings", False)
+
+        # RoPE
+        self.rope_settings = self.read_rope_settings_default(
+            RopeStyle.NEOX,
+            default_rope_theta = 10000,
+            config_dict = self.read_cfg(dict, "text_config", no_default)
+        )
+
+        # Vision model settings
+        read_vision_config = self.read_cfg(dict, "vision_config", no_default)
+        self.vision = read_glm4v_vision_config(read_vision_config)
+
+        prep_path = os.path.join(self.directory, "preprocessor_config.json")
+        with open(prep_path, encoding = "utf8") as f:
+            read_prep_config = json.load(f)
+        self.vision_pp = read_glm4v_pp_config(read_prep_config)
+
+        self.vision_start_token_id = self.read_cfg(int, "image_start_token_id", 151339)
+        self.vision_end_token_id = self.read_cfg(int, "image_end_token_id", 151340)
+
+
+class Glm4VMoeModel(Glm4MoeModel):
+    config_class = Glm4VMoeConfig
+
+    def __init__(
+        self,
+        config: Glm4VMoeConfig,
+        **kwargs
+    ):
+        super().__init__(config, key_prefix = "model.language_model", **kwargs)
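Glm4VMoeConfig reads most of its hyperparameters from a nested text_config block (the "text_config->..." paths passed to read_cfg), parses vision_config with the existing Glm4V helpers, and loads preprocessor_config.json separately. A rough sketch of the config.json shape this implies, written as a Python dict; values marked "placeholder" are illustrative only, while the rest are the defaults visible in the code above:

    # Rough shape of config.json implied by the read_cfg paths above; not the
    # actual GLM-4.5V values except where the code's defaults are used.
    example_config = {
        "architectures": ["Glm4vMoeForConditionalGeneration"],
        "tie_word_embeddings": False,
        "image_start_token_id": 151339,          # default in code
        "image_end_token_id": 151340,            # default in code
        "text_config": {
            "hidden_size": 4096,                 # placeholder
            "num_attention_heads": 32,           # placeholder
            "num_key_value_heads": 8,            # placeholder
            "num_hidden_layers": 46,             # placeholder
            "rms_norm_eps": 1e-5,                # placeholder
            "hidden_act": "silu",
            "norm_topk_prob": True,
            "intermediate_size": 11264,          # placeholder
            "moe_intermediate_size": 1408,       # placeholder
            "n_shared_experts": 1,               # default in code
            "n_routed_experts": 128,             # default in code
            "num_experts_per_tok": 8,            # default in code
            "first_k_dense_replace": 3,          # default in code
            "routed_scaling_factor": 2.5,        # default in code
            "rope_theta": 10000,                 # default in code
        },
        "vision_config": {},                     # parsed by read_glm4v_vision_config()
    }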
