
Commit b4fef2c

[MetaxGPU] Support FastDeploy on metax gpu (#3241)
* [MetaxGPU] Support FastDeploy on metax gpu

* Update metax_worker.py

1. change worker log;
2. remove custom allreduce, adapt it later;
3. remove cuda graph;

* Update __init__.py

1. remove metax's key word comment

* Update __init__.py

1. remove metax's key word comment;
2. add fused_moe_kernel_paddle import

---------

Co-authored-by: yongqiangma <[email protected]>
1 parent ed6bff2 commit b4fef2c

29 files changed: +3224 -11 lines changed

build.sh

Lines changed: 10 additions & 0 deletions
@@ -126,6 +126,16 @@ function copy_ops(){
     return
   fi

+  is_maca=`$python -c "import paddle; print(paddle.device.is_compiled_with_custom_device('metax_gpu'))"`
+  if [ "$is_maca" = "True" ]; then
+    DEVICE_TYPE="metax_gpu"
+    mkdir -p ../fastdeploy/model_executor/ops/base
+    cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
+    cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
+    echo -e "MACA ops have been copy to fastdeploy"
+    return
+  fi
+
   DEVICE_TYPE="cpu"
   cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
   cd ../../../../

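The new branch probes Paddle for the Metax custom device before deciding where the compiled ops are copied. A minimal sketch of the same probe from Python, assuming a Paddle build with the MACA plugin (illustrative only, not part of the commit):

import paddle

# Same check build.sh runs via `$python -c ...`: True only when Paddle was
# compiled against the Metax (MACA) custom-device plugin.
if paddle.device.is_compiled_with_custom_device("metax_gpu"):
    device_type = "metax_gpu"  # build.sh then copies MACA ops into ops/base and ops/gpu
else:
    device_type = "cpu"
print(device_type)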
custom_ops/gpu_ops/helper.h

Lines changed: 2 additions & 1 deletion
@@ -509,6 +509,7 @@ static void PrintMatrix3(const T *mat_d, int num, std::string name) {
 }

 #ifndef PADDLE_WITH_HIP
+#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
 __forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr,
                                                     int mode = 0) {
   uint32_t flag;
@@ -541,7 +542,7 @@ __forceinline__ __device__ void st_flag_release(uint32_t *flag_addr,
                "l"(flag_addr));
   }
 }
-
+#endif
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,

custom_ops/setup_ops.py

Lines changed: 66 additions & 0 deletions
@@ -564,6 +564,72 @@ def find_end_files(directory, end_str):
             ]
         ),
     )
+elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
+    maca_path = os.getenv("MACA_PATH", "/opt/maca")
+    json_dir = "third_party/nlohmann_json"
+    if not os.path.exists(json_dir) or not os.listdir(json_dir):
+        if not os.path.exists(json_dir):
+            os.makedirs(json_dir)
+        clone_git_repo("v3.11.3", "https://gitee.com/learnlov/mirrors_nlohmann_json.git", json_dir)
+        if not os.listdir(json_dir):
+            raise ValueError("Git clone nlohmann_json failed!")
+    sources = [
+        "gpu_ops/save_with_output.cc",
+        "gpu_ops/set_mask_value.cu",
+        "gpu_ops/set_value_by_flags.cu",
+        "gpu_ops/ngram_mask.cu",
+        "gpu_ops/gather_idx.cu",
+        "gpu_ops/get_output_ep.cc",
+        "gpu_ops/token_penalty_multi_scores.cu",
+        "gpu_ops/token_penalty_only_once.cu",
+        "gpu_ops/stop_generation.cu",
+        "gpu_ops/stop_generation_multi_ends.cu",
+        "gpu_ops/set_flags.cu",
+        "gpu_ops/fused_get_rope.cu",
+        "gpu_ops/get_padding_offset.cu",
+        "gpu_ops/update_inputs.cu",
+        "gpu_ops/update_inputs_beam.cu",
+        "gpu_ops/beam_search_softmax.cu",
+        "gpu_ops/rebuild_padding.cu",
+        "gpu_ops/step.cu",
+        "gpu_ops/step_reschedule.cu",
+        "gpu_ops/step_system_cache.cu",
+        "gpu_ops/set_data_ipc.cu",
+        "gpu_ops/read_data_ipc.cu",
+        "gpu_ops/dequant_int8.cu",
+        "gpu_ops/share_external_data.cu",
+        "gpu_ops/extract_text_token_output.cu",
+        "gpu_ops/moe/tritonmoe_preprocess.cu",
+        "gpu_ops/moe/moe_topk_select.cu",
+        "gpu_ops/recover_decode_task.cu",
+    ]
+
+    sources += find_end_files("gpu_ops/speculate_decoding", ".cu")
+    sources += find_end_files("gpu_ops/speculate_decoding", ".cc")
+
+    setup(
+        name="fastdeploy_ops",
+        ext_modules=CUDAExtension(
+            sources=sources,
+            extra_compile_args={
+                "cxx": ["-O3"],
+                "nvcc": [
+                    "-O3",
+                    "-Ithird_party/nlohmann_json/include",
+                    "-Igpu_ops",
+                    "-DPADDLE_DEV",
+                    "-DPADDLE_WITH_CUSTOM_DEVICE_METAX_GPU",
+                ],
+            },
+            library_dirs=[os.path.join(maca_path, "lib")],
+            extra_link_args=["-lruntime_cu"],
+            include_dirs=[
+                os.path.join(maca_path, "include"),
+                os.path.join(maca_path, "include/mcr"),
+                os.path.join(maca_path, "include/common"),
+            ],
+        ),
+    )
 else:
     use_bf16 = envs.FD_CPU_USE_BF16 == "True"

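The nlohmann_json dependency is vendored on demand: if third_party/nlohmann_json is missing or empty, the branch clones a Gitee mirror at tag v3.11.3 and aborts the build when the clone leaves the directory empty. A standalone sketch of that clone-if-missing-then-verify pattern, with a hypothetical clone_git_repo stand-in for the helper already defined in setup_ops.py (paths and tag taken from the diff):

import os
import subprocess


def clone_git_repo(tag: str, url: str, dest: str) -> None:
    # Hypothetical stand-in for setup_ops.py's helper: shallow-clone one tag.
    subprocess.run(["git", "clone", "--depth", "1", "--branch", tag, url, dest], check=False)


json_dir = "third_party/nlohmann_json"
if not os.path.exists(json_dir) or not os.listdir(json_dir):
    os.makedirs(json_dir, exist_ok=True)
    clone_git_repo("v3.11.3", "https://gitee.com/learnlov/mirrors_nlohmann_json.git", json_dir)
    if not os.listdir(json_dir):
        # Same failure mode as the diff: an empty directory means the clone failed.
        raise ValueError("Git clone nlohmann_json failed!")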
fastdeploy/model_executor/forward_meta.py

Lines changed: 6 additions & 0 deletions
@@ -37,6 +37,8 @@ class ForwardMode(IntEnum):
     DECODE = auto()
     # Mixed mode
     MIXED = auto()
+    # Native mode
+    NATIVE = auto()

     def is_prefill(self):
         """Is Extend mode"""
@@ -50,6 +52,10 @@ def is_mixed(self):
         """Is Mixed mode"""
         return self == ForwardMode.MIXED

+    def is_native(self):
+        """Is Native mode"""
+        return self == ForwardMode.NATIVE
+

 @dataclass
 class ForwardMeta:

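Adding NATIVE to ForwardMode lets attention backends branch on forward_meta.forward_mode.is_native(), mirroring the existing prefill/decode/mixed checks. A reduced, self-contained sketch of how the new member behaves (EXTEND is inferred from the is_prefill docstring; this is an illustration, not the full class):

from enum import IntEnum, auto


class ForwardMode(IntEnum):
    # Trimmed copy of the enum in fastdeploy/model_executor/forward_meta.py.
    EXTEND = auto()
    DECODE = auto()
    MIXED = auto()
    NATIVE = auto()

    def is_native(self) -> bool:
        """Is Native mode"""
        return self == ForwardMode.NATIVE


assert ForwardMode.NATIVE.is_native()
assert not ForwardMode.MIXED.is_native()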
fastdeploy/model_executor/layers/activation.py

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@ def __init__(
             or current_platform.is_xpu()
             or current_platform.is_iluvatar()
             or current_platform.is_dcu()
+            or current_platform.is_maca()
         ):
             self.forward = self.forward_cuda
         elif current_platform.is_gcu():

fastdeploy/model_executor/layers/attention/base_attention_backend.py

Lines changed: 21 additions & 0 deletions
@@ -86,6 +86,15 @@ def forward(
                 layer,
                 forward_meta,
             )
+        elif forward_meta.forward_mode.is_native():
+            return self.forward_native_backend(
+                q,
+                k,
+                v,
+                qkv,
+                layer,
+                forward_meta,
+            )
         else:
             return self.forward_extend(
                 q,
@@ -139,3 +148,15 @@ def forward_extend(
     ) -> paddle.Tensor:
         """Run a forward for extend."""
         raise NotImplementedError
+
+    def forward_native_backend(
+        self,
+        q: paddle.Tensor,
+        k: paddle.Tensor,
+        v: paddle.Tensor,
+        qkv: paddle.Tensor,
+        layer: paddle.nn.Layer,
+        forward_meta: ForwardMeta,
+    ) -> paddle.Tensor:
+        """Run a forward for native."""
+        raise NotImplementedError

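forward_native_backend is the hook a concrete backend overrides to serve ForwardMode.NATIVE; the base class only dispatches to it and otherwise raises NotImplementedError. A hedged sketch of what an override could look like, using a plain scaled-dot-product fallback rather than the Metax kernels (the subclass name and attention math are illustrative, not the implementation added by this PR):

import paddle
from fastdeploy.model_executor.forward_meta import ForwardMeta  # module path per this commit's diff


class NaiveNativeBackend:
    """Illustrative override, not part of the commit."""

    def forward_native_backend(
        self,
        q: paddle.Tensor,
        k: paddle.Tensor,
        v: paddle.Tensor,
        qkv: paddle.Tensor,
        layer: paddle.nn.Layer,
        forward_meta: ForwardMeta,
    ) -> paddle.Tensor:
        # Plain attention as a stand-in: scores = q @ k^T * scale, softmax, @ v.
        head_dim = q.shape[-1]
        scores = paddle.matmul(q, k, transpose_y=True) * (head_dim**-0.5)
        probs = paddle.nn.functional.softmax(scores, axis=-1)
        return paddle.matmul(probs, v)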
fastdeploy/model_executor/layers/backends/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -48,3 +48,10 @@
     if hasattr(dcu, "__all__"):
         globals().update({name: getattr(dcu, name) for name in dcu.__all__})
         __all__.extend(dcu.__all__)
+
+if current_platform.is_maca():
+    from . import metax
+
+    if hasattr(metax, "__all__"):
+        globals().update({name: getattr(metax, name) for name in metax.__all__})
+        __all__.extend(metax.__all__)
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .attention.flash_attn_backend import FlashAttentionBackend
+from .moe.fused_moe_triton_metax_backend import MetaxTritonWeightOnlyMoEMethod
+
+__all__ = [
+    "FlashAttentionBackend",
+    "MetaxTritonWeightOnlyMoEMethod",
+]
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+metax gpu backend attention methods
+"""
+from .flash_attention_interface import (
+    flash_attn_func,
+    flash_attn_kvcache_func,
+    flash_attn_unpadded_func,
+)
+from .flash_attn_backend import FlashAttentionBackend
+
+__all__ = [
+    "FlashAttentionBackend",
+    "flash_attn_func",
+    "flash_attn_unpadded_func",
+    "flash_attn_kvcache_func",
+]
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+import os
+from typing import Optional, Tuple, Union
+
+import paddle
+from paddle import Tensor
+
+for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")):
+    if lib.endswith(".so"):
+        paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib)
+
+
+def flash_attn_func(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    fixed_seed_offset: Optional[Tensor] = None,
+    attn_mask: Optional[Tensor] = None,
+    dropout_prob: float = 0.0,
+    causal: bool = False,
+    return_softmax: bool = False,
+    is_test: bool = True,
+    rng_name: str = "",
+) -> Union[Tensor, Tuple[Tensor, ...]]:
+    return paddle._C_ops.flash_attn(
+        q, k, v, fixed_seed_offset, attn_mask, dropout_prob, causal, return_softmax, is_test, rng_name
+    )
+
+
+def flash_attn_unpadded_func(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    cu_seqlens_q: Tensor,
+    cu_seqlens_k: Tensor,
+    max_seqlen_q: Union[int, float],
+    max_seqlen_k: Union[int, float],
+    fixed_seed_offset: Optional[Tensor] = None,
+    attn_mask: Optional[Tensor] = None,
+    softmax_scale: float = 1.0,
+    dropout: float = 0.0,
+    causal: bool = False,
+    return_softmax: bool = False,
+    is_test: bool = True,
+    rng_name: str = "",
+) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+    max_seqlen_q_t = paddle.to_tensor(max_seqlen_q, dtype="int64")
+    max_seqlen_k_t = paddle.to_tensor(max_seqlen_k, dtype="int64")
+
+    outputs = paddle._C_ops.flash_attn_unpadded(
+        q,
+        k,
+        v,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        fixed_seed_offset,
+        attn_mask,
+        max_seqlen_q_t,
+        max_seqlen_k_t,
+        softmax_scale,
+        dropout,
+        causal,
+        return_softmax,
+        is_test,
+        rng_name,
+    )
+    return outputs
+
+
+def flash_attn_kvcache_func(
+    q: Tensor,
+    k_cache: Tensor,
+    v_cache: Tensor,
+    seqlens_k: Tensor,
+    block_table: Tensor,
+    k: Optional[Tensor] = None,
+    v: Optional[Tensor] = None,
+    rotary_cos: Optional[Tensor] = None,
+    rotary_sin: Optional[Tensor] = None,
+    cache_batch_idx: Optional[Tensor] = None,
+    causal: bool = True,
+    is_rotary_interleaved: bool = False,
+    num_splits: int = 1,
+    dropout: float = 0.0,
+    return_softmax: bool = False,
+) -> Tuple[Tensor, Tensor]:
+    out, softmax_lse = paddle._C_ops._run_custom_op(
+        "flash_attn_kvcache",
+        q,
+        k_cache,
+        v_cache,
+        k,
+        v,
+        seqlens_k,
+        rotary_cos,
+        rotary_sin,
+        cache_batch_idx,
+        block_table,
+        causal,
+        is_rotary_interleaved,
+        num_splits,
+        dropout,
+        return_softmax,
+    )
+    return out, softmax_lse

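These wrappers route the dense and varlen paths to Paddle's built-in flash_attn ops and the KV-cache path to a flash_attn_kvcache custom op registered from the .so files under CUSTOM_DEVICE_ROOT. A hedged usage sketch of flash_attn_func; the import path is inferred from the package __init__ diffs above, and the device string, shapes, and dtype are assumptions that require a Paddle build with the Metax plugin:

import paddle

# Import path inferred from backends/__init__.py and the metax package __init__ diffs.
from fastdeploy.model_executor.layers.backends.metax.attention import flash_attn_func

paddle.set_device("metax_gpu")  # assumes the MACA custom-device plugin is installed

# Assumed dense layout [batch, seq_len, num_heads, head_dim].
q = paddle.randn([1, 128, 8, 64], dtype="float16")
k = paddle.randn([1, 128, 8, 64], dtype="float16")
v = paddle.randn([1, 128, 8, 64], dtype="float16")

out = flash_attn_func(q, k, v, causal=True)  # thin wrapper over paddle._C_ops.flash_attn
# The underlying op may return a tuple (out, softmax, ...); keep the first element.
result = out[0] if isinstance(out, (list, tuple)) else out
print(result.shape)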