## Supported Models
| Model Name | Context Length | Quantization | XPUs Required | Deployment Commands | Minimum Version Required |
| - | - | - | - | - | - |
| ERNIE-4.5-300B-A47B | 32K | WINT8 | 8 | export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.0.3 |
| ERNIE-4.5-300B-A47B | 32K | WINT4 | 4 (Recommended) | export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 4 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.0.0 |
| ERNIE-4.5-300B-A47B | 32K | WINT4 | 8 | export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.95 \<br>--load-choices "default" | >=2.0.0 |
| ERNIE-4.5-300B-A47B | 128K | WINT4 | 8 (Recommended) | export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 131072 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.0.0 |
| ERNIE-4.5-21B-A3B | 32K | BF16 | 1 | export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.1.0 |
| ERNIE-4.5-21B-A3B | 32K | WINT8 | 1 | export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.1.0 |
| ERNIE-4.5-21B-A3B | 32K | WINT4 | 1 | export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.1.0 |
| ERNIE-4.5-21B-A3B | 128K | BF16 | 1 | export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.1.0 |
| ERNIE-4.5-21B-A3B | 128K | WINT8 | 1 | export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.1.0 |
| ERNIE-4.5-21B-A3B | 128K | WINT4 | 1 | export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.1.0 |
| ERNIE-4.5-0.3B | 32K | BF16 | 1 | export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.0.3 |
| ERNIE-4.5-0.3B | 32K | WINT8 | 1 | export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.0.3 |
| ERNIE-4.5-0.3B | 128K | BF16 | 1 | export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.0.3 |
| ERNIE-4.5-0.3B | 128K | WINT8 | 1 | export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9 \<br>--load-choices "default" | >=2.0.3 |
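Each command in the table starts an OpenAI-compatible API server on the specified `--port`. As a minimal sketch of how to exercise a running server, assuming it was launched on port 8188 as above and that the standard `/v1/chat/completions` route is available (depending on the FastDeploy version, a `model` field may also need to be included in the request body):

```bash
# Send a simple chat completion request to a server launched with one of the commands above.
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
        "messages": [
          {"role": "user", "content": "Introduce the Kunlunxin XPU in one sentence."}
        ]
      }'
```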
## Quick start
Deploy ERNIE-4.5-300B-A47B with WINT4 quantization on 4 XPUs (the recommended configuration from the table above) and start an OpenAI-compatible server:

```bash
export XPU_VISIBLE_DEVICES="0,1,2,3" # or "4,5,6,7"
export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported
python -m fastdeploy.entrypoints.openai.api_server \
    --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \
    --port 8188 \
    --tensor-parallel-size 4 \
    --max-model-len 32768 \
    --max-num-seqs 64 \
    --quantization "wint4" \
    --gpu-memory-utilization 0.9 \
    --load-choices "default"
```
**Note:** When deploying on 4 XPUs, only two device configurations are supported (`XPU_VISIBLE_DEVICES="0,1,2,3"` or `"4,5,6,7"`), due to hardware limitations such as inter-card interconnect capabilities.
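For interactive use, the same endpoint can stream tokens as they are generated; a minimal sketch, assuming the Quick start server above is listening on port 8188 and supports the OpenAI-style `stream` flag:

```bash
# Stream the response as it is generated; -N disables curl's output buffering.
curl -N -X POST "http://0.0.0.0:8188/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": true
      }'
```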