
Commit b4fef2c

[MetaxGPU] Support FastDeploy on metax gpu (#3241)
* [MetaxGPU] Support FastDeploy on metax gpu

* Update metax_worker.py

1. change worker log;
2. remove custom allreduce, adapt it later;
3. remove cuda graph;

* Update __init__.py

1. remove metax's key word comment

* Update __init__.py

1. remove metax's key word comment;
2. add fused_moe_kernel_paddle import

---------

Co-authored-by: yongqiangma <[email protected]>
1 parent ed6bff2 commit b4fef2c

29 files changed: +3224 -11 lines changed

build.sh

Lines changed: 10 additions & 0 deletions
@@ -126,6 +126,16 @@ function copy_ops(){
     return
   fi

+  is_maca=`$python -c "import paddle; print(paddle.device.is_compiled_with_custom_device('metax_gpu'))"`
+  if [ "$is_maca" = "True" ]; then
+    DEVICE_TYPE="metax_gpu"
+    mkdir -p ../fastdeploy/model_executor/ops/base
+    cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
+    cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
+    echo -e "MACA ops have been copy to fastdeploy"
+    return
+  fi
+
   DEVICE_TYPE="cpu"
   cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
   cd ../../../../

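The new branch probes Paddle for the Metax custom device before deciding where the compiled ops are copied. A minimal sketch of the same probe from Python, assuming a Paddle build with the MACA plugin (illustrative only, not part of the commit):

import paddle

# Same check build.sh runs via `$python -c ...`: True only when Paddle was
# compiled against the Metax (MACA) custom-device plugin.
if paddle.device.is_compiled_with_custom_device("metax_gpu"):
    device_type = "metax_gpu"  # build.sh then copies MACA ops into ops/base and ops/gpu
else:
    device_type = "cpu"
print(device_type)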
custom_ops/gpu_ops/helper.h

Lines changed: 2 additions & 1 deletion
@@ -509,6 +509,7 @@ static void PrintMatrix3(const T *mat_d, int num, std::string name) {
 }

 #ifndef PADDLE_WITH_HIP
+#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
 __forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr,
                                                     int mode = 0) {
   uint32_t flag;
@@ -541,7 +542,7 @@ __forceinline__ __device__ void st_flag_release(uint32_t *flag_addr,
                "l"(flag_addr));
   }
 }
-
+#endif
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,

custom_ops/setup_ops.py

Lines changed: 66 additions & 0 deletions
@@ -564,6 +564,72 @@ def find_end_files(directory, end_str):
             ]
         ),
     )
+elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
+    maca_path = os.getenv("MACA_PATH", "/opt/maca")
+    json_dir = "third_party/nlohmann_json"
+    if not os.path.exists(json_dir) or not os.listdir(json_dir):
+        if not os.path.exists(json_dir):
+            os.makedirs(json_dir)
+        clone_git_repo("v3.11.3", "https://gitee.com/learnlov/mirrors_nlohmann_json.git", json_dir)
+        if not os.listdir(json_dir):
+            raise ValueError("Git clone nlohmann_json failed!")
+    sources = [
+        "gpu_ops/save_with_output.cc",
+        "gpu_ops/set_mask_value.cu",
+        "gpu_ops/set_value_by_flags.cu",
+        "gpu_ops/ngram_mask.cu",
+        "gpu_ops/gather_idx.cu",
+        "gpu_ops/get_output_ep.cc",
+        "gpu_ops/token_penalty_multi_scores.cu",
+        "gpu_ops/token_penalty_only_once.cu",
+        "gpu_ops/stop_generation.cu",
+        "gpu_ops/stop_generation_multi_ends.cu",
+        "gpu_ops/set_flags.cu",
+        "gpu_ops/fused_get_rope.cu",
+        "gpu_ops/get_padding_offset.cu",
+        "gpu_ops/update_inputs.cu",
+        "gpu_ops/update_inputs_beam.cu",
+        "gpu_ops/beam_search_softmax.cu",
+        "gpu_ops/rebuild_padding.cu",
+        "gpu_ops/step.cu",
+        "gpu_ops/step_reschedule.cu",
+        "gpu_ops/step_system_cache.cu",
+        "gpu_ops/set_data_ipc.cu",
+        "gpu_ops/read_data_ipc.cu",
+        "gpu_ops/dequant_int8.cu",
+        "gpu_ops/share_external_data.cu",
+        "gpu_ops/extract_text_token_output.cu",
+        "gpu_ops/moe/tritonmoe_preprocess.cu",
+        "gpu_ops/moe/moe_topk_select.cu",
+        "gpu_ops/recover_decode_task.cu",
+    ]
+
+    sources += find_end_files("gpu_ops/speculate_decoding", ".cu")
+    sources += find_end_files("gpu_ops/speculate_decoding", ".cc")
+
+    setup(
+        name="fastdeploy_ops",
+        ext_modules=CUDAExtension(
+            sources=sources,
+            extra_compile_args={
+                "cxx": ["-O3"],
+                "nvcc": [
+                    "-O3",
+                    "-Ithird_party/nlohmann_json/include",
+                    "-Igpu_ops",
+                    "-DPADDLE_DEV",
+                    "-DPADDLE_WITH_CUSTOM_DEVICE_METAX_GPU",
+                ],
+            },
+            library_dirs=[os.path.join(maca_path, "lib")],
+            extra_link_args=["-lruntime_cu"],
+            include_dirs=[
+                os.path.join(maca_path, "include"),
+                os.path.join(maca_path, "include/mcr"),
+                os.path.join(maca_path, "include/common"),
+            ],
+        ),
+    )
 else:
     use_bf16 = envs.FD_CPU_USE_BF16 == "True"

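The nlohmann_json dependency is vendored on demand: if third_party/nlohmann_json is missing or empty, the branch clones a Gitee mirror at tag v3.11.3 and aborts the build when the clone leaves the directory empty. A standalone sketch of that clone-if-missing-then-verify pattern, with a hypothetical clone_git_repo stand-in for the helper already defined in setup_ops.py (paths and tag taken from the diff):

import os
import subprocess


def clone_git_repo(tag: str, url: str, dest: str) -> None:
    # Hypothetical stand-in for setup_ops.py's helper: shallow-clone one tag.
    subprocess.run(["git", "clone", "--depth", "1", "--branch", tag, url, dest], check=False)


json_dir = "third_party/nlohmann_json"
if not os.path.exists(json_dir) or not os.listdir(json_dir):
    os.makedirs(json_dir, exist_ok=True)
    clone_git_repo("v3.11.3", "https://gitee.com/learnlov/mirrors_nlohmann_json.git", json_dir)
    if not os.listdir(json_dir):
        # Same failure mode as the diff: an empty directory means the clone failed.
        raise ValueError("Git clone nlohmann_json failed!")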
fastdeploy/model_executor/forward_meta.py

Lines changed: 6 additions & 0 deletions
@@ -37,6 +37,8 @@ class ForwardMode(IntEnum):
     DECODE = auto()
     # Mixed mode
     MIXED = auto()
+    # Native mode
+    NATIVE = auto()

     def is_prefill(self):
         """Is Extend mode"""
@@ -50,6 +52,10 @@ def is_mixed(self):
         """Is Mixed mode"""
         return self == ForwardMode.MIXED

+    def is_native(self):
+        """Is Native mode"""
+        return self == ForwardMode.NATIVE
+

 @dataclass
 class ForwardMeta:

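Adding NATIVE to ForwardMode lets attention backends branch on forward_meta.forward_mode.is_native(), mirroring the existing prefill/decode/mixed checks. A reduced, self-contained sketch of how the new member behaves (EXTEND is inferred from the is_prefill docstring; this is an illustration, not the full class):

from enum import IntEnum, auto


class ForwardMode(IntEnum):
    # Trimmed copy of the enum in fastdeploy/model_executor/forward_meta.py.
    EXTEND = auto()
    DECODE = auto()
    MIXED = auto()
    NATIVE = auto()

    def is_native(self) -> bool:
        """Is Native mode"""
        return self == ForwardMode.NATIVE


assert ForwardMode.NATIVE.is_native()
assert not ForwardMode.MIXED.is_native()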
fastdeploy/model_executor/layers/activation.py

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@ def __init__(
             or current_platform.is_xpu()
             or current_platform.is_iluvatar()
             or current_platform.is_dcu()
+            or current_platform.is_maca()
         ):
             self.forward = self.forward_cuda
         elif current_platform.is_gcu():

fastdeploy/model_executor/layers/attention/base_attention_backend.py

Lines changed: 21 additions & 0 deletions
@@ -86,6 +86,15 @@ def forward(
                 layer,
                 forward_meta,
             )
+        elif forward_meta.forward_mode.is_native():
+            return self.forward_native_backend(
+                q,
+                k,
+                v,
+                qkv,
+                layer,
+                forward_meta,
+            )
         else:
             return self.forward_extend(
                 q,
@@ -139,3 +148,15 @@ def forward_extend(
     ) -> paddle.Tensor:
         """Run a forward for extend."""
         raise NotImplementedError
+
+    def forward_native_backend(
+        self,
+        q: paddle.Tensor,
+        k: paddle.Tensor,
+        v: paddle.Tensor,
+        qkv: paddle.Tensor,
+        layer: paddle.nn.Layer,
+        forward_meta: ForwardMeta,
+    ) -> paddle.Tensor:
+        """Run a forward for native."""
+        raise NotImplementedError

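forward_native_backend is the hook a concrete backend overrides to serve ForwardMode.NATIVE; the base class only dispatches to it and otherwise raises NotImplementedError. A hedged sketch of what an override could look like, using a plain scaled-dot-product fallback rather than the Metax kernels (the subclass name and attention math are illustrative, not the implementation added by this PR):

import paddle
from fastdeploy.model_executor.forward_meta import ForwardMeta  # module path per this commit's diff


class NaiveNativeBackend:
    """Illustrative override, not part of the commit."""

    def forward_native_backend(
        self,
        q: paddle.Tensor,
        k: paddle.Tensor,
        v: paddle.Tensor,
        qkv: paddle.Tensor,
        layer: paddle.nn.Layer,
        forward_meta: ForwardMeta,
    ) -> paddle.Tensor:
        # Plain attention as a stand-in: scores = q @ k^T * scale, softmax, @ v.
        head_dim = q.shape[-1]
        scores = paddle.matmul(q, k, transpose_y=True) * (head_dim**-0.5)
        probs = paddle.nn.functional.softmax(scores, axis=-1)
        return paddle.matmul(probs, v)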
fastdeploy/model_executor/layers/backends/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -48,3 +48,10 @@
     if hasattr(dcu, "__all__"):
         globals().update({name: getattr(dcu, name) for name in dcu.__all__})
         __all__.extend(dcu.__all__)
+
+if current_platform.is_maca():
+    from . import metax
+
+    if hasattr(metax, "__all__"):
+        globals().update({name: getattr(metax, name) for name in metax.__all__})
+        __all__.extend(metax.__all__)
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .attention.flash_attn_backend import FlashAttentionBackend
+from .moe.fused_moe_triton_metax_backend import MetaxTritonWeightOnlyMoEMethod
+
+__all__ = [
+    "FlashAttentionBackend",
+    "MetaxTritonWeightOnlyMoEMethod",
+]
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+metax gpu backend attention methods
+"""
+from .flash_attention_interface import (
+    flash_attn_func,
+    flash_attn_kvcache_func,
+    flash_attn_unpadded_func,
+)
+from .flash_attn_backend import FlashAttentionBackend
+
+__all__ = [
+    "FlashAttentionBackend",
+    "flash_attn_func",
+    "flash_attn_unpadded_func",
+    "flash_attn_kvcache_func",
+]
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+import os
+from typing import Optional, Tuple, Union
+
+import paddle
+from paddle import Tensor
+
+for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")):
+    if lib.endswith(".so"):
+        paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib)
+
+
+def flash_attn_func(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    fixed_seed_offset: Optional[Tensor] = None,
+    attn_mask: Optional[Tensor] = None,
+    dropout_prob: float = 0.0,
+    causal: bool = False,
+    return_softmax: bool = False,
+    is_test: bool = True,
+    rng_name: str = "",
+) -> Union[Tensor, Tuple[Tensor, ...]]:
+    return paddle._C_ops.flash_attn(
+        q, k, v, fixed_seed_offset, attn_mask, dropout_prob, causal, return_softmax, is_test, rng_name
+    )
+
+
+def flash_attn_unpadded_func(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    cu_seqlens_q: Tensor,
+    cu_seqlens_k: Tensor,
+    max_seqlen_q: Union[int, float],
+    max_seqlen_k: Union[int, float],
+    fixed_seed_offset: Optional[Tensor] = None,
+    attn_mask: Optional[Tensor] = None,
+    softmax_scale: float = 1.0,
+    dropout: float = 0.0,
+    causal: bool = False,
+    return_softmax: bool = False,
+    is_test: bool = True,
+    rng_name: str = "",
+) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+    max_seqlen_q_t = paddle.to_tensor(max_seqlen_q, dtype="int64")
+    max_seqlen_k_t = paddle.to_tensor(max_seqlen_k, dtype="int64")
+
+    outputs = paddle._C_ops.flash_attn_unpadded(
+        q,
+        k,
+        v,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        fixed_seed_offset,
+        attn_mask,
+        max_seqlen_q_t,
+        max_seqlen_k_t,
+        softmax_scale,
+        dropout,
+        causal,
+        return_softmax,
+        is_test,
+        rng_name,
+    )
+    return outputs
+
+
+def flash_attn_kvcache_func(
+    q: Tensor,
+    k_cache: Tensor,
+    v_cache: Tensor,
+    seqlens_k: Tensor,
+    block_table: Tensor,
+    k: Optional[Tensor] = None,
+    v: Optional[Tensor] = None,
+    rotary_cos: Optional[Tensor] = None,
+    rotary_sin: Optional[Tensor] = None,
+    cache_batch_idx: Optional[Tensor] = None,
+    causal: bool = True,
+    is_rotary_interleaved: bool = False,
+    num_splits: int = 1,
+    dropout: float = 0.0,
+    return_softmax: bool = False,
+) -> Tuple[Tensor, Tensor]:
+    out, softmax_lse = paddle._C_ops._run_custom_op(
+        "flash_attn_kvcache",
+        q,
+        k_cache,
+        v_cache,
+        k,
+        v,
+        seqlens_k,
+        rotary_cos,
+        rotary_sin,
+        cache_batch_idx,
+        block_table,
+        causal,
+        is_rotary_interleaved,
+        num_splits,
+        dropout,
+        return_softmax,
+    )
+    return out, softmax_lse

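These wrappers route the dense and varlen paths to Paddle's built-in flash_attn ops and the KV-cache path to a flash_attn_kvcache custom op registered from the .so files under CUSTOM_DEVICE_ROOT. A hedged usage sketch of flash_attn_func; the import path is inferred from the package __init__ diffs above, and the device string, shapes, and dtype are assumptions that require a Paddle build with the Metax plugin:

import paddle

# Import path inferred from backends/__init__.py and the metax package __init__ diffs.
from fastdeploy.model_executor.layers.backends.metax.attention import flash_attn_func

paddle.set_device("metax_gpu")  # assumes the MACA custom-device plugin is installed

# Assumed dense layout [batch, seq_len, num_heads, head_dim].
q = paddle.randn([1, 128, 8, 64], dtype="float16")
k = paddle.randn([1, 128, 8, 64], dtype="float16")
v = paddle.randn([1, 128, 8, 64], dtype="float16")

out = flash_attn_func(q, k, v, causal=True)  # thin wrapper over paddle._C_ops.flash_attn
# The underlying op may return a tuple (out, softmax, ...); keep the first element.
result = out[0] if isinstance(out, (list, tuple)) else out
print(result.shape)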