6 changes: 4 additions & 2 deletions build.sh
@@ -104,8 +104,7 @@ function copy_ops(){
is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
if [ "$is_npu" = "True" ]; then
DEVICE_TYPE="npu"
cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/npu
echo -e "npu ops have been copy to fastdeploy"
echo -e "npu ops are already present in fastdeploy"
return
fi

@@ -153,6 +152,7 @@ function build_and_install_ops() {
echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
if [ "$is_xpu" = "True" ]; then
cd xpu_ops/src
bash build.sh ${TMP_DIR_REAL_PATH}
@@ -164,6 +164,8 @@ function build_and_install_ops() {
FD_BUILDING_ARCS=${FD_BUILDING_ARCS} FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
fi
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
elif [ "$is_npu" = "True" ]; then
echo -e "${BLUE}[build]${NONE} skipping NPU ops build (already present)"
elif [ "$FD_CPU_USE_BF16" == "false" ]; then
if [ "$FD_BUILDING_ARCS" == "" ]; then
${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
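The skip relies on Paddle's custom-device probe; a quick sanity check that mirrors the shell detection above (not part of the diff) is:

```python
import paddle

# Mirrors the is_npu check in build.sh: when this prints True, the script now
# skips rebuilding the NPU ops instead of copying them again.
print(paddle.is_compiled_with_custom_device("npu"))
```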
2 changes: 1 addition & 1 deletion fastdeploy/model_executor/layers/activation.py
@@ -71,7 +71,7 @@ def __init__(
or current_platform.is_maca()
):
self.forward = self.forward_cuda
elif current_platform.is_gcu():
elif current_platform.is_gcu() or current_platform.is_npu():
self.forward = self.forward_gcu
else:
raise NotImplementedError
2 changes: 2 additions & 0 deletions fastdeploy/model_executor/layers/attention/__init__.py
@@ -22,6 +22,7 @@
from .mla_attention_backend import MLAAttentionBackend
from .native_paddle_backend import PaddleNativeAttnBackend
from .xpu_attn_backend import XPUAttentionBackend
from .npu_fapa_attn_backend import NpuFaPaAttentionBackend

__all__ = [
"AttentionBackend",
@@ -34,4 +35,5 @@
"IluvatarAttnBackend",
"BlockAttentionBackend",
"Attention",
"NpuFaPaAttentionBackend"
]
212 changes: 212 additions & 0 deletions fastdeploy/model_executor/layers/attention/npu_fapa_attn_backend.py
@@ -0,0 +1,212 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from __future__ import annotations

import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional

import paddle
from paddle import core

from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention.ops import (
get_block_shape_and_split_kv_block, init_signal_layerwise,
open_shm_and_get_meta_signal)
from fastdeploy.model_executor.ops.npu import fused_fapa_attention_npu

if TYPE_CHECKING:
from paddle._typing.dtype_like import _DTypeLiteral

# from fastdeploy.config import LLMConfig
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend, AttentionMetadata)


@dataclass
class NpuFaPaAttentionMetadata(AttentionMetadata):
"""
NpuFaPaAttentionMetadata
"""

max_len_kv: paddle.Tensor = None
set_max_lengths: int = -1
encoder_batch_ids: paddle.Tensor = None
encoder_tile_ids_per_batch: paddle.Tensor = None
encoder_num_blocks: paddle.Tensor = None
kv_batch_ids: paddle.Tensor = None
kv_tile_ids_per_batch: paddle.Tensor = None
kv_num_blocks: paddle.Tensor = None
decoder_batch_ids: paddle.Tensor = None
decoder_tile_ids_per_batch: paddle.Tensor = None
decoder_num_blocks: paddle.Tensor = None

_dtype: _DTypeLiteral = paddle.bfloat16
encoder_max_partition_size: int = 32768
max_partition_size: int = 32768
block_tables: Optional[paddle.Tensor] = None
rotary_embs: Optional[paddle.Tensor] = None
attn_mask: Optional[paddle.Tensor] = None
encoder_block_shape_q: Optional[paddle.Tensor] = None
decoder_block_shape_q: Optional[paddle.Tensor] = None
_fuse_kernel_compute_dtype: str = "bf16"

# pd_disaggregation
kv_signal_metadata: Optional[paddle.Tensor] = None
kv_signal_data_list: List[paddle.Tensor] = field(default_factory=list)


class NpuFaPaAttentionBackend(AttentionBackend):
"""
NpuFaPaAttentionBackend backend implementation.
"""

def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, head_dim: int):
"""
NpuFaPaAttentionBackend __init__
"""
super().__init__()
self.attention_metadata: NpuFaPaAttentionMetadata = None
# TODO(gongshaotian): Use fd_config parameters in the correct location
self.block_size = fd_config.parallel_config.block_size
self.max_seq_len = fd_config.parallel_config.max_model_len
self.rope_theta = (
10000.0
if fd_config.model_config.rope_theta is None
else fd_config.model_config.rope_theta
)
self.rope_3d = getattr(fd_config.model_config, "rope_3d", False)
self.causal = getattr(fd_config.model_config, "causal", True)
self.speculative_method: str = fd_config.speculative_config.method
self.use_speculate: bool = self.speculative_method is not None
self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
self.rank = fd_config.parallel_config.tensor_parallel_rank

self.kv_num_heads = kv_num_heads
self.num_heads = num_heads
self.head_dim = head_dim
self.num_layers: int = fd_config.model_config.num_hidden_layers

# pd_disaggregation
self.use_pd_disaggregation = int(os.getenv("FLAGS_use_pd_disaggregation", 0))
self.start_layer_index = fd_config.model_config.start_layer_index

def init_attention_metadata(self, forward_meta):
"""Initialize attntion metadata hence all layers in the forward pass can reuse it."""
metadata = NpuFaPaAttentionMetadata()
metadata.encoder_block_shape_q = 64
metadata.decoder_block_shape_q = 16
metadata.max_partition_size = 32768
metadata.encoder_max_partition_size = 32768
metadata._dtype = paddle.get_default_dtype()
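# Map the default dtype to the short compute-dtype tag stored in the metadata.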
if metadata._dtype == "bfloat16":
metadata._fuse_kernel_compute_dtype = "bf16"
elif metadata._dtype == "float16":
metadata._fuse_kernel_compute_dtype = "fp16"
elif metadata._dtype == "float32":
metadata._fuse_kernel_compute_dtype = "fp32"
metadata.block_tables = forward_meta.block_tables
metadata.rotary_embs = forward_meta.rotary_embs
metadata.attn_mask = forward_meta.attn_mask
metadata.pre_caches_length = forward_meta.pre_caches_length

# # FIXME:
# (
# metadata.encoder_batch_ids,
# metadata.encoder_tile_ids_per_batch,
# metadata.encoder_num_blocks,
# metadata.kv_batch_ids,
# metadata.kv_tile_ids_per_batch,
# metadata.kv_num_blocks,
# metadata.decoder_batch_ids,
# metadata.decoder_tile_ids_per_batch,
# metadata.decoder_num_blocks,
# metadata.max_len_kv,
# metadata.set_max_lengths,
# ) = get_block_shape_and_split_kv_block(
# forward_meta.seq_lens_encoder,
# forward_meta.seq_lens_decoder,
# forward_meta.seq_lens_this_time,
# forward_meta.cum_offsets,
# metadata.encoder_block_shape_q,
# metadata.decoder_block_shape_q,
# self.num_heads // self.kv_num_heads,
# self.block_size,
# self.speculate_max_draft_token_num + 1,
# )

# pd_disaggregation
metadata.kv_signal_data_list = [None] * self.num_layers
if self.use_pd_disaggregation:
metadata.kv_signal_metadata = open_shm_and_get_meta_signal(
self.rank, self.keep_pd_step_flag
)
self.attention_metadata = metadata

def get_attntion_meta(self):
"""get_attntion_meta"""
return self.attention_metadata

def get_kv_cache_shape(
self,
max_num_blocks: int,
kv_cache_quant_type: str = None,
):
"""
Calculate the KV cache shape.
"""
return (max_num_blocks, self.kv_num_heads, self.block_size, self.head_dim)

def forward_mixed(
self,
q,
k,
v,
qkv,
compressed_kv,
k_pe,
layer: Attention,
forward_meta,
):
"""
forward_mixed
"""
metadata = self.attention_metadata

if self.use_pd_disaggregation:
metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise(
metadata.kv_signal_metadata, layer.layer_id + self.start_layer_index
)
# FIXME(guozr): change this to bfloat16

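# fused_fapa_attention_npu consumes the packed QKV tensor together with this
# layer's two paged KV cache tensors and returns a tuple; the attention
# output is its first element.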
res = fused_fapa_attention_npu(
qkv,
metadata.rotary_embs,
forward_meta.caches[2 * layer.layer_id],
forward_meta.caches[2 * layer.layer_id + 1],
forward_meta.seq_lens_encoder,
forward_meta.seq_lens_decoder,
metadata.block_tables,
self.num_heads,
self.kv_num_heads,
self.head_dim,
self.max_seq_len,
self.block_size,
)
return res[0]
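For reference, a minimal sketch of how the new backend might be driven; `fd_config`, `forward_meta`, `layer`, and `qkv` are assumed to come from the existing model-runner plumbing, and the head counts below are hypothetical:

```python
from fastdeploy.model_executor.layers.attention import NpuFaPaAttentionBackend


def run_npu_attention(fd_config, forward_meta, layer, qkv):
    """Hypothetical driver, not part of this PR."""
    backend = NpuFaPaAttentionBackend(fd_config, kv_num_heads=8, num_heads=64, head_dim=128)

    # Paged KV cache layout per layer: (max_num_blocks, kv_num_heads, block_size, head_dim).
    cache_shape = backend.get_kv_cache_shape(max_num_blocks=1024)

    # Build the per-step metadata once; every layer in the forward pass reuses it.
    backend.init_attention_metadata(forward_meta)

    # The fused NPU path only consumes the packed QKV tensor.
    return cache_shape, backend.forward_mixed(None, None, None, qkv, None, None, layer, forward_meta)
```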
3 changes: 3 additions & 0 deletions fastdeploy/model_executor/layers/backends/npu/__init__.py
@@ -15,3 +15,6 @@
"""
npu backend methods
"""
from .quantization.weight_only import NPUWeightOnlyLinearMethod

__all__ = ['NPUWeightOnlyLinearMethod']
69 changes: 69 additions & 0 deletions fastdeploy/model_executor/layers/backends/npu/quantization/weight_only.py
@@ -0,0 +1,69 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import paddle
from fastdeploy.model_executor.layers.quantization.weight_only import (
WeightOnlyConfig, WeightOnlyLinearMethod)
from fastdeploy.model_executor.ops.npu import fused_linear_op as weight_only_linear
from fastdeploy.model_executor.ops.npu import npu_quant_weight
# import inspect

class NPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
"""
Weight only quantization method for linear layer on NPU
"""

def __init__(
self,
quant_config: WeightOnlyConfig,
) -> None:
super().__init__(quant_config)

def create_weights(self, layer):
"""
Create weights for linear layer on NPU
"""

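# The weight scale is 1-D: use the leading dimension of the layer's declared
# weight shape when available, otherwise fall back to embed_dim.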
linear_weight_scale_shape = [layer.embed_dim]
if hasattr(layer, "linear_weight_shape"):
if isinstance(layer.linear_weight_shape, list):
layer_weight_shape = layer.linear_weight_shape
linear_weight_scale_shape = layer_weight_shape[:1]

layer.linear_weight_scale = layer.create_parameter(
shape=linear_weight_scale_shape,
dtype="bfloat16",
is_bias=False,
)

def process_loaded_weights(self, layer, weight) -> None:
"""
Quantize the loaded weights using NPU-specific weight-only quantization
"""

quanted_weight_tensor, weight_scale_tensor = npu_quant_weight(weight)
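# Store the quantized weight transposed; apply() transposes it back before
# calling the fused weight-only linear op.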
layer.linear_weight.set_value(quanted_weight_tensor.T)
layer.linear_weight_scale.set_value(
weight_scale_tensor.astype(paddle.get_default_dtype())
)

def apply(self, layer, x):
linear_out = weight_only_linear(
x,
weight=layer.linear_weight.T,
weight_scale=layer.linear_weight_scale,
)
return linear_out
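A minimal sketch of how this quantization method is exercised; `layer` is assumed to be one of the existing FastDeploy linear layers, `quant_config` a `WeightOnlyConfig` built elsewhere, and the helper itself is hypothetical:

```python
import paddle

from fastdeploy.model_executor.layers.backends.npu import NPUWeightOnlyLinearMethod


def quantize_and_apply(layer, quant_config, weight: paddle.Tensor, x: paddle.Tensor):
    """Hypothetical flow, not part of this PR."""
    method = NPUWeightOnlyLinearMethod(quant_config)
    method.create_weights(layer)                  # allocates layer.linear_weight_scale
    method.process_loaded_weights(layer, weight)  # quantizes the weight and fills the scale
    return method.apply(layer, x)                 # fused weight-only linear on the NPU
```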
4 changes: 4 additions & 0 deletions fastdeploy/model_executor/layers/linear.py
@@ -108,6 +108,7 @@ def __init__(
or current_platform.is_gcu()
or current_platform.is_dcu()
or current_platform.is_maca()
or current_platform.is_npu()
):
self.forward = self.forward_cuda
else:
@@ -555,6 +556,9 @@ def load_weight(self, state_dict: dict):
if self.fd_config.quant_config:
self.quant_method.process_loaded_weights(self, weight_tensor)
else:
# Handle dtype conversion for NPU compatibility
if self.weight.dtype != weight_tensor.dtype:  # FIXME(guozr): this may be where the problem is
weight_tensor = weight_tensor.cast(self.weight.dtype)
self.weight.set_value(weight_tensor)

def load_state_dict(self, state_dict: dict):