Commit 20839ab

qwen3_moe (#3084)

1 parent 91dc87f

30 files changed: +1361 -1087 lines

fastdeploy/config.py

Lines changed: 1 addition & 1 deletion
@@ -663,7 +663,7 @@ class LoadChoices(str, Enum):
 
     DEFAULT = "default"
     # only support qwen3-bf16 now
-    NEW_LOADER = "new_loader"
+    DEFAULT_V1 = "default_v1"
 
 
 class LoadConfig:
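
The v1 loader now registers under the name "default_v1" instead of "new_loader", and the comment above it still scopes it to qwen3 in bf16. For reference, a minimal sketch of the enum as it stands after this change (only the members visible in the diff are shown):

from enum import Enum

class LoadChoices(str, Enum):
    # Sketch limited to the members the diff shows.
    DEFAULT = "default"
    # only support qwen3-bf16 now
    DEFAULT_V1 = "default_v1"

# Because the enum derives from str, config values round-trip from plain strings:
assert LoadChoices("default_v1") is LoadChoices.DEFAULT_V1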

fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py

Lines changed: 14 additions & 26 deletions
@@ -22,7 +22,9 @@
 from paddle import nn
 from paddleformers.utils.log import logger
 
-from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import MoEMethodBase
+from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import (
+    UnquantizedFusedMoEMethod,
+)
 from fastdeploy.model_executor.layers.utils import (
     CpuGuard,
     create_and_set_parameter,
@@ -37,7 +39,7 @@
 )
 
 
-class GCUFusedMoeMethod(MoEMethodBase):
+class GCUFusedMoeMethod(UnquantizedFusedMoEMethod):
     """
     Use GCU to compute Fused MoE.
     """
@@ -46,28 +48,12 @@ def __init__(self, quant_config):
         super().__init__(quant_config)
         self.group_size = -1
 
-    def create_weights(self, layer: nn.Layer, state_dict):
-        """
-        Paddle gcu create weight process.
-        """
-        # bf16
+    def process_loaded_weights(self, layer: nn.Layer, state_dict):
         up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict)
         stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0)
         stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0)
-        for idx, weight_tensor in enumerate([stacked_up_gate_proj_weights, stacked_down_proj_weights]):
-            # shape [E, K, N] -> [E, N, K]
-            weight_tensor = paddle.transpose(weight_tensor, [0, 2, 1])
-            weight_name = self.added_weight_attrs[idx]
-            setattr(
-                layer,
-                weight_name,
-                layer.create_parameter(
-                    shape=weight_tensor.shape,
-                    dtype=weight_tensor.dtype,
-                    default_initializer=paddle.nn.initializer.Constant(0),
-                ),
-            )
-            getattr(layer, weight_name).set_value(weight_tensor)
+        layer.up_gate_proj_weight.set_value(paddle.transpose(stacked_up_gate_proj_weights, [0, 2, 1]))
+        layer.down_proj_weight.set_value(paddle.transpose(stacked_down_proj_weights, [0, 2, 1]))
 
     @paddle.no_grad()
     def compute_ffn(
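
With parameter creation moved into the shared UnquantizedFusedMoEMethod base, the GCU hook only reshapes and copies: per-expert [K, N] matrices are stacked to [E, K, N], transposed to [E, N, K], and written into the pre-created parameters via set_value. A minimal standalone sketch of that layout change, with illustrative sizes:

import paddle

num_experts, k, n = 4, 8, 16  # illustrative sizes, not the real model's
per_expert = [paddle.randn([k, n]) for _ in range(num_experts)]

stacked = paddle.stack(per_expert, axis=0)         # [E, K, N]
transposed = paddle.transpose(stacked, [0, 2, 1])  # [E, N, K]
assert transposed.shape == [num_experts, n, k]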
@@ -202,18 +188,19 @@ def apply(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Paddle gcu compute Fused MoE.
         """
+        gate_out = gate(x.cast("float32"))
         return self.compute_ffn(layer, x, gate_out, enable_quant=False)
 
     def apply_ep_prefill(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Apply the EP prefill method.
@@ -224,7 +211,7 @@ def apply_ep_decode(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Apply the EP decoder method.
@@ -235,7 +222,7 @@ def apply_tp(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Paddle Cutlass compute Fused MoE.
@@ -400,9 +387,10 @@ def apply(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Paddle gcu compute Fused MoE.
         """
+        gate_out = gate(x.cast("float32"))
         return self.compute_ffn(layer, x, gate_out, enable_quant=True)
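
Across these hunks the apply* signatures all change the same way: callers now pass the gate layer itself rather than precomputed routing logits, and each method derives gate_out internally in float32. A minimal sketch of the convention, assuming a plain Linear stand-in for the gate (names and sizes are illustrative):

import paddle
from paddle import nn

hidden_size, num_experts = 8, 4
gate = nn.Linear(hidden_size, num_experts)  # assumed stand-in for the MoE gate
x = paddle.randn([2, hidden_size]).cast("float16")  # activations in half precision

# What each apply* method now does internally before routing:
gate_out = gate(x.cast("float32"))
assert gate_out.shape == [2, num_experts]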

fastdeploy/model_executor/layers/backends/gcu/quantization/weight_only.py

Lines changed: 9 additions & 1 deletion
@@ -37,14 +37,22 @@ def __init__(
         self.quant_config = quant_config
         self.group_size = -1
 
-    def create_weights(self, layer):
+    def create_weights(self, layer, **extra_weight_attrs):
         # The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
         weight_scale_shape = [layer.weight_shape[1]]
 
         layer.weight_shape.reverse()
         if self.quant_config.name() == "wint4":
             layer.weight_shape[0] //= 2
         layer.weight_dtype = "int8"
+
+        layer.weight = layer.create_parameter(
+            shape=layer.weight_shape,
+            dtype=layer.weight_dtype,
+            is_bias=False,
+            default_initializer=paddle.nn.initializer.Constant(0),
+        )
+
         layer.weight_scale = layer.create_parameter(
             shape=weight_scale_shape,
             dtype=layer._dtype,
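
One consequence of creating layer.weight here is that its shape must already reflect the quantized storage layout: the weight is stored as int8 in [out, in] order (hence the reverse()), and wint4 packs two 4-bit values per int8 element, halving the leading dim. A minimal sketch of just that shape bookkeeping, with illustrative sizes:

# Illustrative sizes; quant_name stands in for self.quant_config.name().
weight_shape = [128, 64]                # [in_features, out_features]
weight_scale_shape = [weight_shape[1]]  # per-channel: one scale per output dim

weight_shape.reverse()                  # stored as [out_features, in_features]
quant_name = "wint4"
if quant_name == "wint4":
    weight_shape[0] //= 2               # two int4 values packed per int8
assert weight_shape == [32, 128] and weight_scale_shape == [64]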

fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py

Lines changed: 7 additions & 1 deletion
@@ -35,7 +35,7 @@ def __init__(
     ) -> None:
         super().__init__(quant_config)
 
-    def create_weights(self, layer: nn.Layer) -> None:
+    def create_weights(self, layer: nn.Layer, **extra_weight_attrs) -> None:
         """
         Create weights for linear layer on XPU
         """
@@ -45,6 +45,12 @@ def create_weights(self, layer: nn.Layer) -> None:
         if self.quant_config.name() == "weight_only_int4":
             layer.weight_shape[0] //= 2
         layer.weight_dtype = "int8"
+        layer.weight = layer.create_parameter(
+            shape=layer.weight_shape,
+            dtype=layer.weight_dtype,
+            is_bias=False,
+            default_initializer=paddle.nn.initializer.Constant(0),
+        )
         layer.weight_scale = layer.create_parameter(
             shape=weight_scale_shape,
             dtype="float32",
