Commit 7207f7e

npu support

1 parent be94bdd commit 7207f7e

40 files changed: +2041, -12 lines

build.sh

Lines changed: 4 additions & 2 deletions
@@ -104,8 +104,7 @@ function copy_ops(){
     is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
     if [ "$is_npu" = "True" ]; then
         DEVICE_TYPE="npu"
-        cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/npu
-        echo -e "npu ops have been copy to fastdeploy"
+        echo -e "npu ops are already present in fastdeploy"
         return
     fi

@@ -153,6 +152,7 @@ function build_and_install_ops() {
     echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
     TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
     is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
+    is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
     if [ "$is_xpu" = "True" ]; then
         cd xpu_ops/src
         bash build.sh ${TMP_DIR_REAL_PATH}

@@ -164,6 +164,8 @@ function build_and_install_ops() {
            FD_BUILDING_ARCS=${FD_BUILDING_ARCS} FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
        fi
        find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
+    elif [ "$is_npu" = "True" ]; then
+        echo -e "${BLUE}[build]${NONE} skipping NPU ops build (already present)"
     elif [ "$FD_CPU_USE_BF16" == "false" ]; then
        if [ "$FD_BUILDING_ARCS" == "" ]; then
            ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
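Note: the NPU detection that build.sh relies on can be reproduced on its own to confirm whether the installed Paddle wheel was built with the NPU custom device; a minimal sketch using only the call that appears in the hunks above:

import paddle

# Same one-liner the script runs via `$python -c ...`: prints True only when the
# installed Paddle wheel was compiled with the custom "npu" device plugin, which
# is the condition under which DEVICE_TYPE is set to "npu" and the ops build is skipped.
print(paddle.is_compiled_with_custom_device("npu"))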

fastdeploy/model_executor/layers/activation.py

Lines changed: 3 additions & 1 deletion
@@ -71,7 +71,7 @@ def __init__(
             or current_platform.is_maca()
         ):
             self.forward = self.forward_cuda
-        elif current_platform.is_gcu():
+        elif current_platform.is_gcu() or current_platform.is_npu():
             self.forward = self.forward_gcu
         else:
             raise NotImplementedError

@@ -147,3 +147,5 @@ def forward_gcu(self, x):
         if self.bias is not None:
             out = out + self.bias
         return out
+
+

fastdeploy/model_executor/layers/attention/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,7 @@
 from .mla_attention_backend import MLAAttentionBackend
 from .native_paddle_backend import PaddleNativeAttnBackend
 from .xpu_attn_backend import XPUAttentionBackend
+from .npu_fapa_attn_backend import NpuFaPaAttentionBackend

 __all__ = [
     "AttentionBackend",

@@ -34,4 +35,5 @@
     "IluvatarAttnBackend",
     "BlockAttentionBackend",
     "Attention",
+    "NpuFaPaAttentionBackend"
 ]
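With the export above, the new backend can be pulled in like the existing ones; a minimal import check (assumes an NPU-enabled install, since the backend module imports fastdeploy.model_executor.ops.npu at import time):

from fastdeploy.model_executor.layers.attention import NpuFaPaAttentionBackend

# The class is also listed in __all__, so wildcard imports of the package pick it up.
print(NpuFaPaAttentionBackend)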
fastdeploy/model_executor/layers/attention/npu_fapa_attn_backend.py

Lines changed: 212 additions & 0 deletions

@@ -0,0 +1,212 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from __future__ import annotations

import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional
from paddle import core

import paddle
from fastdeploy.model_executor.layers.attention.ops import (
    get_block_shape_and_split_kv_block, init_signal_layerwise,
    open_shm_and_get_meta_signal)
from fastdeploy.model_executor.ops.npu import fused_fapa_attention_npu

if TYPE_CHECKING:
    from paddle._typing.dtype_like import _DTypeLiteral

    # from fastdeploy.config import LLMConfig
    from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
    AttentionBackend, AttentionMetadata)


@dataclass
class NpuFaPaAttentionMetadata(AttentionMetadata):
    """
    NpuFaPaAttentionMetadata
    """

    max_len_kv: paddle.Tensor = None
    set_max_lengths: int = -1
    encoder_batch_ids: paddle.Tensor = None
    encoder_tile_ids_per_batch: paddle.Tensor = None
    encoder_num_blocks: paddle.Tensor = None
    kv_batch_ids: paddle.Tensor = None
    kv_tile_ids_per_batch: paddle.Tensor = None
    kv_num_blocks: paddle.Tensor = None
    decoder_batch_ids: paddle.Tensor = None
    decoder_tile_ids_per_batch: paddle.Tensor = None
    decoder_num_blocks: paddle.Tensor = None

    _dtype: _DTypeLiteral = paddle.bfloat16
    encoder_max_partition_size: int = 32768
    max_partition_size: int = 32768
    block_tables: Optional[paddle.Tensor] = None
    rotary_embs: Optional[paddle.Tensor] = None
    attn_mask: Optional[paddle.Tensor] = None
    encoder_block_shape_q: Optional[paddle.Tensor] = None
    decoder_block_shape_q: Optional[paddle.Tensor] = None
    _fuse_kernel_compute_dtype: str = "bf16"

    # pd_disaggregation
    kv_signal_metadata: Optional[paddle.Tensor] = None
    kv_signal_data_list: List[paddle.Tensor] = field(default_factory=list)


class NpuFaPaAttentionBackend(AttentionBackend):
    """
    NpuFaPaAttentionBackend backend implementation.
    """

    def __init__(self, llm_config, kv_num_heads: int, num_heads: int, head_dim: int):
        """
        NpuFaPaAttentionBackend __init__
        """
        super().__init__()
        self.attention_metadata: NpuFaPaAttentionMetadata = None
        # TODO(gongshaotian): Use llm_config parameters in the correct location
        self.block_size = llm_config.parallel_config.block_size
        self.max_seq_len = llm_config.parallel_config.max_model_len
        self.rope_theta = (
            10000.0
            if llm_config.model_config.rope_theta is None
            else llm_config.model_config.rope_theta
        )
        self.rope_3d = getattr(llm_config.model_config, "rope_3d", False)
        self.causal = getattr(llm_config.model_config, "causal", True)
        self.speculate_method = llm_config.parallel_config.speculate_method
        self.use_speculate = self.speculate_method is not None
        self.speculate_max_draft_token_num = (
            llm_config.parallel_config.speculate_max_draft_tokens
        )
        self.keep_pd_step_flag = llm_config.speculative_config.is_mtp
        self.rank = llm_config.parallel_config.tensor_parallel_rank

        self.kv_num_heads = kv_num_heads
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.num_layers = llm_config.model_config.num_layers

        # pd_disaggregation
        self.use_pd_disaggregation = int(os.getenv("FLAGS_use_pd_disaggregation", 0))
        self.start_layer_index = llm_config.model_config.start_layer_index

    def init_attention_metadata(self, forward_meta):
        """Initialize attention metadata so that all layers in the forward pass can reuse it."""
        metadata = NpuFaPaAttentionMetadata()
        metadata.encoder_block_shape_q = 64
        metadata.decoder_block_shape_q = 16
        metadata.max_partition_size = 32768
        metadata.encoder_max_partition_size = 32768
        metadata._dtype = paddle.get_default_dtype()
        if metadata._dtype == "bfloat16":
            metadata._fuse_kernel_compute_dtype = "bf16"
        elif metadata._dtype == "float16":
            metadata._fuse_kernel_compute_dtype = "fp16"
        elif metadata._dtype == "float32":
            metadata._fuse_kernel_compute_dtype = "fp32"
        metadata.block_tables = forward_meta.block_tables
        metadata.rotary_embs = forward_meta.rotary_embs
        metadata.attn_mask = forward_meta.attn_mask
        metadata.pre_caches_length = forward_meta.pre_caches_length

        # # FIXME:
        # (
        #     metadata.encoder_batch_ids,
        #     metadata.encoder_tile_ids_per_batch,
        #     metadata.encoder_num_blocks,
        #     metadata.kv_batch_ids,
        #     metadata.kv_tile_ids_per_batch,
        #     metadata.kv_num_blocks,
        #     metadata.decoder_batch_ids,
        #     metadata.decoder_tile_ids_per_batch,
        #     metadata.decoder_num_blocks,
        #     metadata.max_len_kv,
        #     metadata.set_max_lengths,
        # ) = get_block_shape_and_split_kv_block(
        #     forward_meta.seq_lens_encoder,
        #     forward_meta.seq_lens_decoder,
        #     forward_meta.seq_lens_this_time,
        #     forward_meta.cum_offsets,
        #     metadata.encoder_block_shape_q,
        #     metadata.decoder_block_shape_q,
        #     self.num_heads // self.kv_num_heads,
        #     self.block_size,
        #     self.speculate_max_draft_token_num + 1,
        # )

        # pd_disaggregation
        metadata.kv_signal_data_list = [None] * self.num_layers
        if self.use_pd_disaggregation:
            metadata.kv_signal_metadata = open_shm_and_get_meta_signal(
                self.rank, self.keep_pd_step_flag
            )
        self.attention_metadata = metadata

    def get_attntion_meta(self):
        """get_attntion_meta"""
        return self.attention_metadata

    def get_kv_cache_shape(
        self,
        max_num_blocks: int,
    ):
        """
        Calculate the kv cache shape
        """
        return (max_num_blocks, self.kv_num_heads, self.block_size, self.head_dim)

    def forward_mixed(
        self,
        q,
        k,
        v,
        qkv,
        layer: Attention,
        forward_meta,
    ):
        """
        forward_mixed
        """
        metadata = self.attention_metadata

        if self.use_pd_disaggregation:
            metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise(
                metadata.kv_signal_metadata, layer.layer_id + self.start_layer_index
            )
        # FIXME(guozr): change this to bfloat16

        res = fused_fapa_attention_npu(
            qkv,
            metadata.rotary_embs,
            forward_meta.caches[2 * layer.layer_id],
            forward_meta.caches[2 * layer.layer_id + 1],
            forward_meta.seq_lens_encoder,
            forward_meta.seq_lens_decoder,
            metadata.block_tables,
            self.num_heads,
            self.kv_num_heads,
            self.head_dim,
            self.max_seq_len,
            self.block_size,
        )
        # res=paddle.randn([13,1024],dtype=paddle.bfloat16)
        # res=[res]
        return res[0]
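A rough usage sketch of the backend defined above. The constructor arguments follow the signature shown in the file; llm_config is assumed to be the FastDeploy config object whose parallel_config/model_config fields __init__ reads, and the head counts are illustrative only:

from fastdeploy.model_executor.layers.attention import NpuFaPaAttentionBackend

# llm_config: assumed FastDeploy LLM config (see the constructor above);
# kv_num_heads/num_heads/head_dim are hypothetical values for illustration.
backend = NpuFaPaAttentionBackend(llm_config, kv_num_heads=8, num_heads=64, head_dim=128)

# Each of the two per-layer caches indexed as forward_meta.caches[2 * layer_id]
# and forward_meta.caches[2 * layer_id + 1] is expected to use this block-paged layout:
#   (max_num_blocks, kv_num_heads, block_size, head_dim)
print(backend.get_kv_cache_shape(max_num_blocks=1024))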
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
npu backend methods
"""
from .quantization.weight_only import NPUWeightOnlyLinearMethod

__all__ = ['NPUWeightOnlyLinearMethod']
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import paddle
from fastdeploy.model_executor.layers.quantization.weight_only import (
    WeightOnlyConfig, WeightOnlyLinearMethod)
from fastdeploy.model_executor.ops.npu import fused_linear_op as weight_only_linear
from fastdeploy.model_executor.ops.npu import npu_quant_weight
# import inspect

class NPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
    """
    Weight only quantization method for linear layer on NPU
    """

    def __init__(
        self,
        quant_config: WeightOnlyConfig,
    ) -> None:
        super().__init__(quant_config)

    def create_weights(self, layer):
        """
        Create weights for linear layer on NPU
        """
        linear_weight_scale_shape = [layer.embed_dim]
        # 'qkv_proj', 'up_gate_proj', 'down_proj'
        # if layer.prefix.split('.')[-1] in ['qkv_proj']:
        #     linear_weight_scale_shape = [layer.input_size]
        # else:
        #     linear_weight_scale_shape = [layer.embed_dim]

        if hasattr(layer, "linear_weight_shape"):
            if isinstance(layer.linear_weight_shape, list):
                layer_weight_shape = layer.linear_weight_shape
                linear_weight_scale_shape = layer_weight_shape[:1]
                # if layer.prefix.split('.')[-1] in ['qkv_proj']:
                #     linear_weight_scale_shape = layer_weight_shape[1:]
                # else:
                #     linear_weight_scale_shape = layer_weight_shape[:1]

        # layer.linear_weight_quant = layer.create_parameter(  # xy1
        #     shape=[layer.embed_dim, layer.input_size],
        #     dtype="int8",
        #     is_bias=False,
        # )

        layer.linear_weight_scale = layer.create_parameter(
            shape=linear_weight_scale_shape,
            dtype="bfloat16",
            is_bias=False,
        )

    def process_loaded_weights(self, layer, weight) -> None:
        """
        loaded_weights using npu special quantization
        """
        # print(layer.prefix.split('.')[-1])
        # layer_weight_shape = layer.linear_weight_shape
        # print("layer_weight_shape: ", layer_weight_shape)

        # if layer.prefix.split('.')[-1] in ['qkv_proj']:
        #     quanted_weight_tensor, weight_scale_tensor = npu_quant_weight(weight.T)
        # else:
        #     quanted_weight_tensor, weight_scale_tensor = npu_quant_weight(weight)

        quanted_weight_tensor, weight_scale_tensor = npu_quant_weight(weight)
        layer.linear_weight.set_value(quanted_weight_tensor.T)
        # layer.linear_weight_quant.set_value(quanted_weight_tensor)  # xy1
        layer.linear_weight_scale.set_value(
            weight_scale_tensor.astype(paddle.get_default_dtype())
        )

    def apply(self, layer, x):
        # if layer.prefix.split('.')[-1] in ['qkv_proj']:
        #     linear_out = weight_only_linear(
        #         x,
        #         weight=layer.linear_weight.T,
        #         weight_scale=layer.linear_weight_scale,
        #     )
        # else:
        #     linear_out = weight_only_linear(
        #         x,
        #         weight=layer.linear_weight,
        #         weight_scale=layer.linear_weight_scale,
        #     )

        linear_out = weight_only_linear(
            x,
            weight=layer.linear_weight.T,
            weight_scale=layer.linear_weight_scale,
        )
        return linear_out
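A sketch of the data flow the method above sets up, using only the two NPU ops it imports (npu_quant_weight and fused_linear_op). The tensor shapes and the quantized-weight-plus-scale layout are assumptions for illustration; the authoritative wiring is process_loaded_weights followed by apply:

import paddle
from fastdeploy.model_executor.ops.npu import fused_linear_op as weight_only_linear
from fastdeploy.model_executor.ops.npu import npu_quant_weight

# Assumed bf16 weight of an illustrative [4096, 4096] linear layer.
weight = paddle.randn([4096, 4096]).astype("bfloat16")
quanted_weight, weight_scale = npu_quant_weight(weight)  # quantized weight + scale

# process_loaded_weights stores quanted_weight.T in layer.linear_weight, and apply
# passes layer.linear_weight.T back in, so the op receives the untransposed tensor.
x = paddle.randn([2, 4096]).astype("bfloat16")
out = weight_only_linear(
    x,
    weight=quanted_weight,
    weight_scale=weight_scale.astype(paddle.get_default_dtype()),
)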
