diff --git a/docs/source/user-guide/pd-disaggregation/1p1d.md b/docs/source/user-guide/pd-disaggregation/1p1d.md
index c2bdb47e2..25c7a9e23 100644
--- a/docs/source/user-guide/pd-disaggregation/1p1d.md
+++ b/docs/source/user-guide/pd-disaggregation/1p1d.md
@@ -13,14 +13,12 @@ For illustration purposes, let us take GPU as an example and assume the model us
 ### Run prefill server
 Prefiller Launch Command:
 ```bash
-export PYTHONHASHSEED=123456
 export CUDA_VISIBLE_DEVICES=0
 vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \
 --trust-remote-code \
---enforce-eager \
 --no-enable-prefix-caching \
 --port 7800 \
 --block-size 128 \
@@ -42,14 +40,12 @@ vllm serve /home/models/Qwen2.5-7B-Instruct \
 ### Run decode server
 Decoder Launch Command:
 ```bash
-export PYTHONHASHSEED=123456
 export CUDA_VISIBLE_DEVICES=0
 vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \
 --trust-remote-code \
---enforce-eager \
 --no-enable-prefix-caching \
 --port 7801 \
 --block-size 128 \
diff --git a/docs/source/user-guide/pd-disaggregation/npgd.md b/docs/source/user-guide/pd-disaggregation/npgd.md
index 05bbc0823..5c25a19c9 100644
--- a/docs/source/user-guide/pd-disaggregation/npgd.md
+++ b/docs/source/user-guide/pd-disaggregation/npgd.md
@@ -19,14 +19,12 @@ For illustration purposes, let us assume that the model used is Qwen2.5-7B-Instr
 ### Run prefill server
 Prefiller Launch Command:
 ```bash
-export PYTHONHASHSEED=123456
 export ASCEND_RT_VISIBLE_DEVICES=0
 vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \
 --trust-remote-code \
---enforce-eager \
 --no-enable-prefix-caching \
 --port 7800 \
 --block-size 128 \
@@ -49,14 +47,12 @@
 ### Run decode server
 Decoder Launch Command:
 ```bash
-export PYTHONHASHSEED=123456
 export CUDA_VISIBLE_DEVICES=0
 vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \
 --trust-remote-code \
---enforce-eager \
 --no-enable-prefix-caching \
 --port 7801 \
 --block-size 128 \
diff --git a/docs/source/user-guide/pd-disaggregation/xpyd.md b/docs/source/user-guide/pd-disaggregation/xpyd.md
index b21f19ada..b710d13e0 100644
--- a/docs/source/user-guide/pd-disaggregation/xpyd.md
+++ b/docs/source/user-guide/pd-disaggregation/xpyd.md
@@ -13,14 +13,12 @@ For illustration purposes, let us take GPU as an example and assume the model us
 ### Run prefill servers
 Prefiller1 Launch Command:
 ```bash
-export PYTHONHASHSEED=123456
 export CUDA_VISIBLE_DEVICES=0
 vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \
 --trust-remote-code \
---enforce-eager \
 --no-enable-prefix-caching \
 --port 7800 \
 --block-size 128 \
@@ -41,14 +39,12 @@ vllm serve /home/models/Qwen2.5-7B-Instruct \
 
 Prefiller2 Launch Command:
 ```bash
-export PYTHONHASHSEED=123456
 export CUDA_VISIBLE_DEVICES=1
 vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \
 --trust-remote-code \
---enforce-eager \
 --no-enable-prefix-caching \
 --port 7801 \
 --block-size 128 \