Commit 3972293

moe wip

1 parent 9573de8 commit 3972293

5 files changed: +237 -7 lines changed
fastdeploy/model_executor/layers/moe/fused_moe_npu_backend.py (new file)

Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from typing import Dict

import paddle
from paddle import nn

from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import (
    UnquantizedFusedMoEMethod,
)
from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
from fastdeploy.model_executor.layers.quantization.weight_only import WeightOnlyConfig
from fastdeploy.model_executor.ops.npu import npu_quant_weight


class NPUMoEMethod(UnquantizedFusedMoEMethod):
    """
    NPU MoE method (unquantized weights).
    """

    def process_loaded_weights(self, layer: nn.Layer, state_dict):
        up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict)
        # Transpose each per-expert weight from [k, n] to [n, k], then stack along a new expert axis.
        for weights in [up_gate_proj_weights, down_proj_weights]:
            for idx, weight in enumerate(weights):
                weights[idx] = weight.transpose([1, 0])
        stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0)
        stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0)

        layer.up_gate_proj_weight.set_value(stacked_up_gate_proj_weights)
        layer.down_proj_weight.set_value(stacked_down_proj_weights)

    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate: nn.Layer,
    ) -> paddle.Tensor:
        """
        NPU compute Fused MoE (tensor-parallel path).
        """
        from fastdeploy.model_executor.ops.npu import fused_sparse_moe

        fused_moe_out = fused_sparse_moe(
            x,
            gate.weight.transpose([1, 0]),
            layer.up_gate_proj_weight,
            layer.down_proj_weight,
            None,  # ffn1_bias
            None,  # ffn1_scale
            None,  # ffn2_bias
            None,  # ffn2_scale
            self.moe_quant_type,
            layer.top_k,
            layer.tp_size,
        )
        if layer.tp_size > 1:
            from fastdeploy.distributed.communication import (
                tensor_model_parallel_all_reduce,
            )

            tensor_model_parallel_all_reduce(fused_moe_out)

        return fused_moe_out

    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate: nn.Layer,
    ) -> paddle.Tensor:
        """
        Apply the EP prefill method.
        """
        raise NotImplementedError

    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate: nn.Layer,
    ) -> paddle.Tensor:
        """
        Apply the EP decode method.
        """
        raise NotImplementedError


class NPUWeightOnlyMoEMethod(QuantMethodBase):
    """
    NPU weight-only quantized Fused MoE method.
    """

    def __init__(
        self,
        quant_config: WeightOnlyConfig,
    ) -> None:
        super().__init__()
        self.quant_config = quant_config
        self.moe_quant_type = self.quant_config.algo

    def create_weights(self, layer: nn.Layer, state_dict: Dict[str, paddle.Tensor]):
        """
        NPU create weight process: quantize each expert's FFN weights and stack them.
        """
        up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict)
        assert len(up_gate_proj_weights) == layer.num_local_experts
        assert len(down_proj_weights) == layer.num_local_experts
        assert up_gate_proj_weights[0].shape == [
            layer.hidden_size,
            layer.moe_intermediate_size * 2,
        ]
        assert down_proj_weights[0].shape == [
            layer.moe_intermediate_size,
            layer.hidden_size,
        ]

        added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
        added_scale_attrs = [
            "up_gate_proj_weight_scale",
            "down_proj_weight_scale",
        ]

        for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]):
            weight_name = added_weight_attrs[idx]
            scale_name = added_scale_attrs[idx]

            weight_list = []
            weight_scale_list = []
            for i in range(layer.num_local_experts):
                quant_weight, scale = npu_quant_weight(
                    weight_tensor[i], self.moe_quant_type, -1, -1
                )  # weight is [k, n]
                weight_list.append(quant_weight.transpose([1, 0]))  # transpose weight to [n, k]
                weight_scale_list.append(scale)
            quanted_weight = paddle.stack(weight_list, axis=0)
            setattr(
                layer,
                weight_name,
                layer.create_parameter(
                    shape=quanted_weight.shape,
                    dtype=quanted_weight.dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ),
            )
            getattr(layer, weight_name).set_value(quanted_weight)

            quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
            setattr(
                layer,
                scale_name,
                layer.create_parameter(
                    shape=quanted_weight_scale.shape,
                    dtype=quanted_weight_scale.dtype,
                ),
            )
            getattr(layer, scale_name).set_value(quanted_weight_scale)

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate: nn.Layer,
    ) -> paddle.Tensor:
        """
        NPU compute Fused MoE.
        """
        from fastdeploy.model_executor.ops.npu import fused_sparse_moe

        fused_moe_out = fused_sparse_moe(
            x,
            gate.weight.transpose([1, 0]),
            layer.up_gate_proj_weight,
            layer.down_proj_weight,
            None,  # ffn1_bias
            (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),  # ffn1_scale
            None,  # ffn2_bias
            (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),  # ffn2_scale
            self.moe_quant_type,
            layer.top_k,
            layer.tp_size,
        )
        if layer.tp_size > 1:
            from fastdeploy.distributed.communication import (
                tensor_model_parallel_all_reduce,
            )

            tensor_model_parallel_all_reduce(fused_moe_out)

        return fused_moe_out
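
Below is a minimal, self-contained shape sketch (plain Paddle with made-up sizes, not part of the commit) of the weight layout both methods produce: each expert's [k, n] checkpoint weight is transposed to [n, k] and the per-expert weights are stacked along a new leading expert axis.

import paddle

# Illustrative sizes only (hypothetical; real values come from the model config).
num_experts, hidden_size, intermediate_size = 4, 8, 16

# Per-expert up_gate_proj weights as loaded from the checkpoint: [hidden_size, 2 * intermediate_size].
up_gate_proj_weights = [paddle.randn([hidden_size, intermediate_size * 2]) for _ in range(num_experts)]

# Transpose each expert weight to [2 * intermediate_size, hidden_size], then stack on a new expert axis.
stacked = paddle.stack([w.transpose([1, 0]) for w in up_gate_proj_weights], axis=0)

print(stacked.shape)  # [4, 32, 8] == [num_experts, 2 * intermediate_size, hidden_size]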

fastdeploy/model_executor/layers/moe/moe.py

Lines changed: 4 additions & 0 deletions
@@ -55,6 +55,10 @@ def get_moe_method():
         )

         return MetaxTritonWeightOnlyMoEMethod(None)
+    elif current_platform.is_npu():
+        from .fused_moe_npu_backend import NPUMoEMethod
+
+        return NPUMoEMethod(None)
     raise NotImplementedError

fastdeploy/model_executor/layers/quantization/weight_only.py

Lines changed: 5 additions & 2 deletions
@@ -105,8 +105,11 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:

             return GPUWeightOnlyLinearMethod(self)
         elif current_platform.is_npu():
-            from fastdeploy.model_executor.layers.backends import NPUWeightOnlyLinearMethod
-            return NPUWeightOnlyLinearMethod(self)
+            from fastdeploy.model_executor.layers.backends import (NPUWeightOnlyLinearMethod, NPUWeightOnlyMoEMethod)
+            if isinstance(layer, FusedMoE):
+                return NPUWeightOnlyMoEMethod(self)
+            else:
+                return NPUWeightOnlyLinearMethod(self)
         else:
             if isinstance(layer, FusedMoE):
                 if layer.use_method == "cutlass":
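
For reference, a tiny standalone sketch (hypothetical stub classes, not FastDeploy's real ones) of the dispatch rule this hunk introduces on NPU: FusedMoE layers get the weight-only MoE method, every other layer keeps the weight-only linear method.

class FusedMoE:  # stand-in for FastDeploy's FusedMoE layer
    pass

class LinearLayer:  # stand-in for any non-MoE layer
    pass

class NPUWeightOnlyMoEMethod:
    def __init__(self, quant_config):
        self.quant_config = quant_config

class NPUWeightOnlyLinearMethod:
    def __init__(self, quant_config):
        self.quant_config = quant_config

def get_quant_method(quant_config, layer):
    # Same branch structure as the NPU case in the hunk above.
    if isinstance(layer, FusedMoE):
        return NPUWeightOnlyMoEMethod(quant_config)
    else:
        return NPUWeightOnlyLinearMethod(quant_config)

print(type(get_quant_method("weight_only_int8", FusedMoE())).__name__)     # NPUWeightOnlyMoEMethod
print(type(get_quant_method("weight_only_int8", LinearLayer())).__name__)  # NPUWeightOnlyLinearMethod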

fastdeploy/model_executor/ops/npu/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@
 from .get_token_penalty_multi_scores import get_token_penalty_multi_scores_npu
 from .top_p_sampling import top_p_sampling_npu
 from .weight_quantize import npu_quant_weight
+from .sparse_moe import fused_sparse_moe

 PACKAGE = "fastdeploy.model_executor.ops.npu"

fastdeploy/model_executor/ops/npu/sparse_moe.py

Lines changed: 25 additions & 5 deletions
@@ -1,8 +1,9 @@
 import inspect

 import paddle
-import paddlenlp_ops
 from paddle.base import core
+import inspect
+


 # npu interface refer to gpu interface
@@ -22,11 +23,28 @@ def fused_sparse_moe(
     """
     call npu func to implement this function
     """
-    ffn1_weight = paddle.cast(ffn1_weight, paddle.bfloat16)
-    ffn2_weight = paddle.cast(ffn2_weight, paddle.bfloat16)
+    frame = inspect.currentframe()
+    args, _, _, values = inspect.getargvalues(frame)
+    params = {arg: values[arg] for arg in args}
+    for k, v in params.items():
+        if isinstance(v, paddle.Tensor):
+            print(f"{k}: {v.shape}, {v.dtype}")
+
+    # Transpose weights to match expected format: [num_experts, input_dim, output_dim]
+    print(f"Original ffn1_weight shape: {ffn1_weight.shape}")
+    print(f"Original ffn2_weight shape: {ffn2_weight.shape}")
+
+    ffn1_weight = ffn1_weight.transpose([0, 2, 1])  # [64, 3072, 2560] -> [64, 2560, 3072]
+    ffn2_weight = ffn2_weight.transpose([0, 2, 1])  # [64, 2560, 1536] -> [64, 1536, 2560]
+
+
+    print(f"Transformed ffn1_weight shape: {ffn1_weight.shape}")
+    print(f"Transformed ffn2_weight shape: {ffn2_weight.shape}")


+    print(f"Original gate_weight shape: {gate_weight.shape}")
     gate_weight = gate_weight.transpose([1, 0]).astype(input.dtype)
+    print(f"Transformed gate_weight shape: {gate_weight.shape}")

     temp = paddle.zeros([1]).astype(input.dtype)

@@ -42,7 +60,9 @@ def fused_sparse_moe(
         quanttype = 6
     else:
         quanttype = 1
-    y = paddlenlp_ops.sparse_moe(
+
+    y = core.eager._run_custom_op(
+        "sparse_moe",
         input,
         gate_weight,
         temp,
@@ -68,7 +88,7 @@ def fused_sparse_moe(
         zero_hot,
         moe_topk,
         input.dtype == paddle.bfloat16,
-        tp_size,
+        tp_size,
         quanttype,
     )
     return y
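
The argument dump added at the top of fused_sparse_moe relies on frame introspection; here is a minimal standalone sketch of the same pattern (hypothetical demo function and shapes, not part of the commit).

import inspect

import paddle


def demo(x, gate_weight, moe_topk):
    # inspect.getargvalues on the current frame returns the function's own argument
    # names and values, so tensor shapes/dtypes can be printed without naming each one.
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    params = {arg: values[arg] for arg in args}
    for name, value in params.items():
        if isinstance(value, paddle.Tensor):
            print(f"{name}: {value.shape}, {value.dtype}")


demo(paddle.randn([2, 8]), paddle.randn([8, 4]), moe_topk=2)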
