1
1
## Supported Models
2
2
| Model Name| Context Length| Quantization| XPUs Required| Deployment Commands| Minimum Version Required|
3
3
| -| -| -| -| -| -|
4
- | ERNIE-4.5-300B-A47B| 32K| WINT8| 8| export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 64 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9| >=2.0.3|
5
- | ERNIE-4.5-300B-A47B| 32K| WINT4| 4 (Recommended)| export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 4 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 64 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9| >=2.0.0|
6
- | ERNIE-4.5-300B-A47B| 32K| WINT4| 8| export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 64 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.95| >=2.0.0|
7
- | ERNIE-4.5-300B-A47B| 128K| WINT4| 8 (Recommended)| export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 64 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9| >=2.0.0|
8
- | ERNIE-4.5-21B-A3B| 32K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
9
- | ERNIE-4.5-21B-A3B| 32K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
10
- | ERNIE-4.5-21B-A3B| 32K| WINT4| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
11
- | ERNIE-4.5-21B-A3B| 128K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
12
- | ERNIE-4.5-21B-A3B| 128K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
13
- | ERNIE-4.5-21B-A3B| 128K| WINT4| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
14
- | ERNIE-4.5-0.3B| 32K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9| >=2.0.3|
15
- | ERNIE-4.5-0.3B| 32K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9| >=2.0.3|
16
- | ERNIE-4.5-0.3B| 128K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9| >=2.0.3|
17
- | ERNIE-4.5-0.3B| 128K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9| >=2.0.3|
4
+ | ERNIE-4.5-300B-A47B| 32K| WINT8| 8| export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 64 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9| >=2.0.3|
5
+ | ERNIE-4.5-300B-A47B| 32K| WINT4| 4 (Recommended)| export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 4 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 64 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9| >=2.0.0|
6
+ | ERNIE-4.5-300B-A47B| 32K| WINT4| 8| export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 64 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.95| >=2.0.0|
7
+ | ERNIE-4.5-300B-A47B| 128K| WINT4| 8 (Recommended)| export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 64 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9| >=2.0.0|
8
+ | ERNIE-4.5-21B-A3B| 32K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
9
+ | ERNIE-4.5-21B-A3B| 32K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
10
+ | ERNIE-4.5-21B-A3B| 32K| WINT4| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
11
+ | ERNIE-4.5-21B-A3B| 128K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
12
+ | ERNIE-4.5-21B-A3B| 128K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
13
+ | ERNIE-4.5-21B-A3B| 128K| WINT4| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9| >=2.1.0|
14
+ | ERNIE-4.5-0.3B| 32K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9| >=2.0.3|
15
+ | ERNIE-4.5-0.3B| 32K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9| >=2.0.3|
16
+ | ERNIE-4.5-0.3B| 128K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9| >=2.0.3|
17
+ | ERNIE-4.5-0.3B| 128K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9| >=2.0.3|
18
18
19
19
## Quick start
20
20
@@ -28,6 +28,7 @@ Deploy an OpenAI API-compatible server using FastDeploy with the following comma
28
28
29
29
``` bash
30
30
export XPU_VISIBLE_DEVICES="0,1,2,3" # Specify which cards to be used
31
+ export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported
31
32
python -m fastdeploy.entrypoints.openai.api_server \
32
33
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
33
34
--port 8188 \
0 commit comments