Commit 7207f7e

npu support

1 parent be94bdd commit 7207f7e

40 files changed: +2041, -12 lines

build.sh

Lines changed: 4 additions & 2 deletions
@@ -104,8 +104,7 @@ function copy_ops(){
     is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
     if [ "$is_npu" = "True" ]; then
         DEVICE_TYPE="npu"
-        cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/npu
-        echo -e "npu ops have been copy to fastdeploy"
+        echo -e "npu ops are already present in fastdeploy"
         return
     fi

@@ -153,6 +152,7 @@ function build_and_install_ops() {
     echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
     TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
     is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
+    is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
     if [ "$is_xpu" = "True" ]; then
         cd xpu_ops/src
         bash build.sh ${TMP_DIR_REAL_PATH}

@@ -164,6 +164,8 @@ function build_and_install_ops() {
            FD_BUILDING_ARCS=${FD_BUILDING_ARCS} FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
        fi
        find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
+    elif [ "$is_npu" = "True" ]; then
+        echo -e "${BLUE}[build]${NONE} skipping NPU ops build (already present)"
     elif [ "$FD_CPU_USE_BF16" == "false" ]; then
        if [ "$FD_BUILDING_ARCS" == "" ]; then
            ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
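Note: the NPU detection that build.sh relies on can be reproduced on its own to confirm whether the installed Paddle wheel was built with the NPU custom device; a minimal sketch using only the call that appears in the hunks above:

import paddle

# Same one-liner the script runs via `$python -c ...`: prints True only when the
# installed Paddle wheel was compiled with the custom "npu" device plugin, which
# is the condition under which DEVICE_TYPE is set to "npu" and the ops build is skipped.
print(paddle.is_compiled_with_custom_device("npu"))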

fastdeploy/model_executor/layers/activation.py

Lines changed: 3 additions & 1 deletion
@@ -71,7 +71,7 @@ def __init__(
             or current_platform.is_maca()
         ):
             self.forward = self.forward_cuda
-        elif current_platform.is_gcu():
+        elif current_platform.is_gcu() or current_platform.is_npu():
             self.forward = self.forward_gcu
         else:
             raise NotImplementedError

@@ -147,3 +147,5 @@ def forward_gcu(self, x):
         if self.bias is not None:
             out = out + self.bias
         return out
+
+

fastdeploy/model_executor/layers/attention/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,7 @@
 from .mla_attention_backend import MLAAttentionBackend
 from .native_paddle_backend import PaddleNativeAttnBackend
 from .xpu_attn_backend import XPUAttentionBackend
+from .npu_fapa_attn_backend import NpuFaPaAttentionBackend

 __all__ = [
     "AttentionBackend",

@@ -34,4 +35,5 @@
     "IluvatarAttnBackend",
     "BlockAttentionBackend",
     "Attention",
+    "NpuFaPaAttentionBackend"
 ]
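With the export above, the new backend can be pulled in like the existing ones; a minimal import check (assumes an NPU-enabled install, since the backend module imports fastdeploy.model_executor.ops.npu at import time):

from fastdeploy.model_executor.layers.attention import NpuFaPaAttentionBackend

# The class is also listed in __all__, so wildcard imports of the package pick it up.
print(NpuFaPaAttentionBackend)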
fastdeploy/model_executor/layers/attention/npu_fapa_attn_backend.py

Lines changed: 212 additions & 0 deletions

@@ -0,0 +1,212 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from __future__ import annotations

import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional
from paddle import core

import paddle
from fastdeploy.model_executor.layers.attention.ops import (
    get_block_shape_and_split_kv_block, init_signal_layerwise,
    open_shm_and_get_meta_signal)
from fastdeploy.model_executor.ops.npu import fused_fapa_attention_npu

if TYPE_CHECKING:
    from paddle._typing.dtype_like import _DTypeLiteral

    # from fastdeploy.config import LLMConfig
    from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
    AttentionBackend, AttentionMetadata)


@dataclass
class NpuFaPaAttentionMetadata(AttentionMetadata):
    """
    NpuFaPaAttentionMetadata
    """

    max_len_kv: paddle.Tensor = None
    set_max_lengths: int = -1
    encoder_batch_ids: paddle.Tensor = None
    encoder_tile_ids_per_batch: paddle.Tensor = None
    encoder_num_blocks: paddle.Tensor = None
    kv_batch_ids: paddle.Tensor = None
    kv_tile_ids_per_batch: paddle.Tensor = None
    kv_num_blocks: paddle.Tensor = None
    decoder_batch_ids: paddle.Tensor = None
    decoder_tile_ids_per_batch: paddle.Tensor = None
    decoder_num_blocks: paddle.Tensor = None

    _dtype: _DTypeLiteral = paddle.bfloat16
    encoder_max_partition_size: int = 32768
    max_partition_size: int = 32768
    block_tables: Optional[paddle.Tensor] = None
    rotary_embs: Optional[paddle.Tensor] = None
    attn_mask: Optional[paddle.Tensor] = None
    encoder_block_shape_q: Optional[paddle.Tensor] = None
    decoder_block_shape_q: Optional[paddle.Tensor] = None
    _fuse_kernel_compute_dtype: str = "bf16"

    # pd_disaggregation
    kv_signal_metadata: Optional[paddle.Tensor] = None
    kv_signal_data_list: List[paddle.Tensor] = field(default_factory=list)


class NpuFaPaAttentionBackend(AttentionBackend):
    """
    NpuFaPaAttentionBackend backend implementation.
    """

    def __init__(self, llm_config, kv_num_heads: int, num_heads: int, head_dim: int):
        """
        NpuFaPaAttentionBackend __init__
        """
        super().__init__()
        self.attention_metadata: NpuFaPaAttentionMetadata = None
        # TODO(gongshaotian): Use llm_config parameters in the correct location
        self.block_size = llm_config.parallel_config.block_size
        self.max_seq_len = llm_config.parallel_config.max_model_len
        self.rope_theta = (
            10000.0
            if llm_config.model_config.rope_theta is None
            else llm_config.model_config.rope_theta
        )
        self.rope_3d = getattr(llm_config.model_config, "rope_3d", False)
        self.causal = getattr(llm_config.model_config, "causal", True)
        self.speculate_method = llm_config.parallel_config.speculate_method
        self.use_speculate = self.speculate_method is not None
        self.speculate_max_draft_token_num = (
            llm_config.parallel_config.speculate_max_draft_tokens
        )
        self.keep_pd_step_flag = llm_config.speculative_config.is_mtp
        self.rank = llm_config.parallel_config.tensor_parallel_rank

        self.kv_num_heads = kv_num_heads
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.num_layers = llm_config.model_config.num_layers

        # pd_disaggregation
        self.use_pd_disaggregation = int(os.getenv("FLAGS_use_pd_disaggregation", 0))
        self.start_layer_index = llm_config.model_config.start_layer_index

    def init_attention_metadata(self, forward_meta):
        """Initialize attention metadata so that all layers in the forward pass can reuse it."""
        metadata = NpuFaPaAttentionMetadata()
        metadata.encoder_block_shape_q = 64
        metadata.decoder_block_shape_q = 16
        metadata.max_partition_size = 32768
        metadata.encoder_max_partition_size = 32768
        metadata._dtype = paddle.get_default_dtype()
        if metadata._dtype == "bfloat16":
            metadata._fuse_kernel_compute_dtype = "bf16"
        elif metadata._dtype == "float16":
            metadata._fuse_kernel_compute_dtype = "fp16"
        elif metadata._dtype == "float32":
            metadata._fuse_kernel_compute_dtype = "fp32"
        metadata.block_tables = forward_meta.block_tables
        metadata.rotary_embs = forward_meta.rotary_embs
        metadata.attn_mask = forward_meta.attn_mask
        metadata.pre_caches_length = forward_meta.pre_caches_length

        # # FIXME:
        # (
        #     metadata.encoder_batch_ids,
        #     metadata.encoder_tile_ids_per_batch,
        #     metadata.encoder_num_blocks,
        #     metadata.kv_batch_ids,
        #     metadata.kv_tile_ids_per_batch,
        #     metadata.kv_num_blocks,
        #     metadata.decoder_batch_ids,
        #     metadata.decoder_tile_ids_per_batch,
        #     metadata.decoder_num_blocks,
        #     metadata.max_len_kv,
        #     metadata.set_max_lengths,
        # ) = get_block_shape_and_split_kv_block(
        #     forward_meta.seq_lens_encoder,
        #     forward_meta.seq_lens_decoder,
        #     forward_meta.seq_lens_this_time,
        #     forward_meta.cum_offsets,
        #     metadata.encoder_block_shape_q,
        #     metadata.decoder_block_shape_q,
        #     self.num_heads // self.kv_num_heads,
        #     self.block_size,
        #     self.speculate_max_draft_token_num + 1,
        # )

        # pd_disaggregation
        metadata.kv_signal_data_list = [None] * self.num_layers
        if self.use_pd_disaggregation:
            metadata.kv_signal_metadata = open_shm_and_get_meta_signal(
                self.rank, self.keep_pd_step_flag
            )
        self.attention_metadata = metadata

    def get_attntion_meta(self):
        """get_attntion_meta"""
        return self.attention_metadata

    def get_kv_cache_shape(
        self,
        max_num_blocks: int,
    ):
        """
        Calculate the kv cache shape
        """
        return (max_num_blocks, self.kv_num_heads, self.block_size, self.head_dim)

    def forward_mixed(
        self,
        q,
        k,
        v,
        qkv,
        layer: Attention,
        forward_meta,
    ):
        """
        forward_mixed
        """
        metadata = self.attention_metadata

        if self.use_pd_disaggregation:
            metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise(
                metadata.kv_signal_metadata, layer.layer_id + self.start_layer_index
            )
        # FIXME(guozr): change this to bfloat16

        res = fused_fapa_attention_npu(
            qkv,
            metadata.rotary_embs,
            forward_meta.caches[2 * layer.layer_id],
            forward_meta.caches[2 * layer.layer_id + 1],
            forward_meta.seq_lens_encoder,
            forward_meta.seq_lens_decoder,
            metadata.block_tables,
            self.num_heads,
            self.kv_num_heads,
            self.head_dim,
            self.max_seq_len,
            self.block_size,
        )
        # res=paddle.randn([13,1024],dtype=paddle.bfloat16)
        # res=[res]
        return res[0]
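A rough usage sketch of the backend defined above. The constructor arguments follow the signature shown in the file; llm_config is assumed to be the FastDeploy config object whose parallel_config/model_config fields __init__ reads, and the head counts are illustrative only:

from fastdeploy.model_executor.layers.attention import NpuFaPaAttentionBackend

# llm_config: assumed FastDeploy LLM config (see the constructor above);
# kv_num_heads/num_heads/head_dim are hypothetical values for illustration.
backend = NpuFaPaAttentionBackend(llm_config, kv_num_heads=8, num_heads=64, head_dim=128)

# Each of the two per-layer caches indexed as forward_meta.caches[2 * layer_id]
# and forward_meta.caches[2 * layer_id + 1] is expected to use this block-paged layout:
#   (max_num_blocks, kv_num_heads, block_size, head_dim)
print(backend.get_kv_cache_shape(max_num_blocks=1024))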
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
npu backend methods
"""
from .quantization.weight_only import NPUWeightOnlyLinearMethod

__all__ = ['NPUWeightOnlyLinearMethod']
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import paddle
from fastdeploy.model_executor.layers.quantization.weight_only import (
    WeightOnlyConfig, WeightOnlyLinearMethod)
from fastdeploy.model_executor.ops.npu import fused_linear_op as weight_only_linear
from fastdeploy.model_executor.ops.npu import npu_quant_weight
# import inspect

class NPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
    """
    Weight only quantization method for linear layer on NPU
    """

    def __init__(
        self,
        quant_config: WeightOnlyConfig,
    ) -> None:
        super().__init__(quant_config)

    def create_weights(self, layer):
        """
        Create weights for linear layer on NPU
        """
        linear_weight_scale_shape = [layer.embed_dim]
        # 'qkv_proj', 'up_gate_proj', 'down_proj'
        # if layer.prefix.split('.')[-1] in ['qkv_proj']:
        #     linear_weight_scale_shape = [layer.input_size]
        # else:
        #     linear_weight_scale_shape = [layer.embed_dim]

        if hasattr(layer, "linear_weight_shape"):
            if isinstance(layer.linear_weight_shape, list):
                layer_weight_shape = layer.linear_weight_shape
                linear_weight_scale_shape = layer_weight_shape[:1]
                # if layer.prefix.split('.')[-1] in ['qkv_proj']:
                #     linear_weight_scale_shape = layer_weight_shape[1:]
                # else:
                #     linear_weight_scale_shape = layer_weight_shape[:1]

        # layer.linear_weight_quant = layer.create_parameter(  # xy1
        #     shape=[layer.embed_dim, layer.input_size],
        #     dtype="int8",
        #     is_bias=False,
        # )

        layer.linear_weight_scale = layer.create_parameter(
            shape=linear_weight_scale_shape,
            dtype="bfloat16",
            is_bias=False,
        )

    def process_loaded_weights(self, layer, weight) -> None:
        """
        loaded_weights using npu special quantization
        """
        # print(layer.prefix.split('.')[-1])
        # layer_weight_shape = layer.linear_weight_shape
        # print("layer_weight_shape: ", layer_weight_shape)

        # if layer.prefix.split('.')[-1] in ['qkv_proj']:
        #     quanted_weight_tensor, weight_scale_tensor = npu_quant_weight(weight.T)
        # else:
        #     quanted_weight_tensor, weight_scale_tensor = npu_quant_weight(weight)

        quanted_weight_tensor, weight_scale_tensor = npu_quant_weight(weight)
        layer.linear_weight.set_value(quanted_weight_tensor.T)
        # layer.linear_weight_quant.set_value(quanted_weight_tensor)  # xy1
        layer.linear_weight_scale.set_value(
            weight_scale_tensor.astype(paddle.get_default_dtype())
        )

    def apply(self, layer, x):
        # if layer.prefix.split('.')[-1] in ['qkv_proj']:
        #     linear_out = weight_only_linear(
        #         x,
        #         weight=layer.linear_weight.T,
        #         weight_scale=layer.linear_weight_scale,
        #     )
        # else:
        #     linear_out = weight_only_linear(
        #         x,
        #         weight=layer.linear_weight,
        #         weight_scale=layer.linear_weight_scale,
        #     )

        linear_out = weight_only_linear(
            x,
            weight=layer.linear_weight.T,
            weight_scale=layer.linear_weight_scale,
        )
        return linear_out
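A sketch of the data flow the method above sets up, using only the two NPU ops it imports (npu_quant_weight and fused_linear_op). The tensor shapes and the quantized-weight-plus-scale layout are assumptions for illustration; the authoritative wiring is process_loaded_weights followed by apply:

import paddle
from fastdeploy.model_executor.ops.npu import fused_linear_op as weight_only_linear
from fastdeploy.model_executor.ops.npu import npu_quant_weight

# Assumed bf16 weight of an illustrative [4096, 4096] linear layer.
weight = paddle.randn([4096, 4096]).astype("bfloat16")
quanted_weight, weight_scale = npu_quant_weight(weight)  # quantized weight + scale

# process_loaded_weights stores quanted_weight.T in layer.linear_weight, and apply
# passes layer.linear_weight.T back in, so the op receives the untransposed tensor.
x = paddle.randn([2, 4096]).astype("bfloat16")
out = weight_only_linear(
    x,
    weight=quanted_weight,
    weight_scale=weight_scale.astype(paddle.get_default_dtype()),
)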
