add docs.

hiworldwzj · hiworldwzj · commit 9560b20a6aec · 2025-04-16T12:04:00.000+08:00
diff --git a/docs/CN/source/getting_started/quickstart.rst b/docs/CN/source/getting_started/quickstart.rst
@@ -93,7 +93,7 @@
 
 .. code-block:: console
 
-    $ CUDA_VISIBLE_DEVICES=0  python -m lightllm.server.api_server \
+    $ python -m lightllm.server.api_server \
     $ --model_dir /your/model/path \
     $ --run_mode "pd_master" \
     $ --host /your/host/ip \
@@ -165,3 +165,107 @@
     $ cd test
     $ python benchmark_client.py --num_clients 100 --input_num 2000 --tokenizer_path /nvme/DeepSeek-R1/ --url http://127.0.01:8000/generate_stream
 
+
+3. PD 分离多PD_Master节点类型启动模型服务
+---------------------------------------------
+查找本机IP
+
+.. code-block:: console
+
+    $ hostname -i
+
+运行MPS(可选, 有mps支持性能会好特别多，但是部分显卡和驱动环境开启mps会容易出现错误，建议升级驱动到较高版本，特别是H系列卡)
+
+.. code-block:: console
+
+    $ nvidia-cuda-mps-control -d 
+
+
+运行config_server服务
+
+.. code-block:: console
+
+    $ python -m lightllm.server.api_server \
+    $ --run_mode "config_server" \
+    $ --config_server_host /your/host/ip \
+    $ --config_server_port 60088
+
+运行pd_master服务, 在多pd_master节点模式下，可以开启多个pd_master服务，来实现负载均衡，单个pd_master因为python gil锁的原因
+其并发性能存在上限。
+
+.. code-block:: console
+
+    $ python -m lightllm.server.api_server \
+    $ --model_dir /your/model/path \
+    $ --run_mode "pd_master" \
+    $ --host /your/host/ip \
+    $ --port 60011 \
+    $ --config_server_host <config_server_host> \
+    $ --config_server_port <config_server_port>
+
+新建终端,运行prefill服务 
+
+.. code-block:: console
+
+    $ CUDA_VISIBLE_DEVICES=0,1 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /data/fengdahu/model/Qwen2-7B/ \
+    $ --run_mode "prefill" \
+    $ --host /your/host/ip \
+    $ --port 8017 \
+    $ --tp 2 \
+    $ --nccl_port 2732 \
+    $ --max_total_token_num 400000 \
+    $ --tokenizer_mode fast \
+    $ --use_dynamic_prompt_cache \
+    $ --max_req_total_len 16000 \
+    $ --running_max_req_size 128 \
+    $ --disable_cudagraph \
+    $ --config_server_host <config_server_host> \
+    $ --config_server_port <config_server_port>
+
+新建终端,运行decoding服务
+
+.. code-block:: console
+
+    $ CUDA_VISIBLE_DEVICES=2,3 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /data/fengdahu/model/Qwen2-7B/ \
+    $ --run_mode "decode" \
+    $ --host /your/host/ip \
+    $ --port 8118 \
+    $ --nccl_port 12322 \
+    $ --tp 2 \
+    $ --max_total_token_num 400000 \
+    $ --graph_max_len_in_batch 2048 \
+    $ --graph_max_batch_size 16 \
+    $ --tokenizer_mode fast \
+    $ --use_dynamic_prompt_cache \
+    $ --config_server_host <config_server_host> \
+    $ --config_server_port <config_server_port>
+
+.. note::
+    prefill 和 decoding 阶段的 tp 大小需保持一致；目前支持 prefill 和 decode 节点的数量动态变化，同时 prefill 和 decode 可以跨机部署。
+
+
+4. （可选）测试模型服务
+-------------------------
+
+在新的终端，使用下面的指令对模型服务进行测试, 在多pd_master模式下，每个pd_master都可以作为访问入口：
+
+.. code-block:: console
+
+    $ curl http://server_ip:server_port/generate \
+    $      -H "Content-Type: application/json" \
+    $      -d '{
+    $            "inputs": "What is AI?",
+    $            "parameters":{
+    $              "max_new_tokens":17, 
+    $              "frequency_penalty":1
+    $            }
+    $           }'
+
+
+对于DeepSeek-R1模型，可以用如下脚本进行测试：
+
+.. code-block:: console
+
+    $ cd test
+    $ python benchmark_client.py --num_clients 100 --input_num 2000 --tokenizer_path /nvme/DeepSeek-R1/ --url http://127.0.0.1:8000/generate_stream
+
diff --git a/test/test.sh b/test/test.sh
@@ -71,3 +71,43 @@ CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.serve
 LOADWORKER=8 python -m lightllm.server.api_server --port 8018 --model_dir /dev/shm/llama2-7b-chat --tp 2 --graph_max_batch_size 16 --use_dynamic_prompt_cache
 
 
+# 多 pd_master 节点部署实例
+python -m lightllm.server.api_server --run_mode "config_server" --config_server_host 10.120.114.74 --config_server_port 60088
+
+python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60011 --config_server_host 10.120.114.74 --config_server_port 60088
+
+python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60012 --config_server_host 10.120.114.74 --config_server_port 60088
+
+
+nvidia-cuda-mps-control -d 
+CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \
+--run_mode "prefill" \
+--host 10.120.178.74 \
+--port 8019 \
+--tp 1 \
+--nccl_port 2732 \
+--max_total_token_num 40000 \
+--tokenizer_mode fast \
+--use_dynamic_prompt_cache \
+--max_req_total_len 16000 \
+--running_max_req_size 128 \
+--disable_cudagraph \
+--config_server_host 10.120.114.74 \
+--config_server_port 60088
+
+CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \
+--run_mode "decode" \
+--host 10.120.178.74 \
+--port 8121 \
+--nccl_port 12322 \
+--tp 1 \
+--max_total_token_num 40000 \
+--graph_max_len_in_batch 2048 \
+--graph_max_batch_size 16 \
+--tokenizer_mode fast \
+--use_dynamic_prompt_cache \
+--config_server_host 10.120.114.74 \
+--config_server_port 60088
+
+
+