diff --git a/README.md b/README.md index 918aa3874..dbffa1288 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ LightLLM is a Python-based LLM (Large Language Model) inference and serving fram - [Install LightLLM](https://lightllm-en.readthedocs.io/en/latest/getting_started/installation.html) - [Quick Start](https://lightllm-en.readthedocs.io/en/latest/getting_started/quickstart.html) -- [TuTorial](https://lightllm-en.readthedocs.io/en/latest/tutorial/) +- [Tutorial](https://lightllm-en.readthedocs.io/en/latest/tutorial/deepseek_deployment.html) ## Performance diff --git a/docker/Dockerfile.deepep b/docker/Dockerfile.deepep index ce47ea5af..fef3b757a 100644 --- a/docker/Dockerfile.deepep +++ b/docker/Dockerfile.deepep @@ -17,8 +17,6 @@ RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-ge git && \ rm -rf /var/lib/apt/lists/* -ENV http_proxy=http://devsft:d0663c03baee@10.119.176.202:3128 -ENV https_proxy=http://devsft:d0663c03baee@10.119.176.202:3128 RUN case ${TARGETPLATFORM} in \ "linux/arm64") MAMBA_ARCH=aarch64 ;; \ *) MAMBA_ARCH=x86_64 ;; \ @@ -40,10 +38,9 @@ WORKDIR /root COPY ./requirements.txt /lightllm/requirements.txt RUN pip install -r /lightllm/requirements.txt --no-cache-dir --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124 -RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100 +RUN pip install --no-cache-dir https://github.com/ModelTC/flash-attn-3-build/releases/download/v2.7.4.post1/flash_attn-3.0.0b1-cp39-cp39-linux_x86_64.whl -RUN git clone https://github.com/Dao-AILab/flash-attention.git -b v2.7.4.post1 -RUN cd flash-attention/hopper && FLASH_ATTN_CUDA_ARCHS=90 NVCC_THREADS=128 python setup.py install +RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100 RUN git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git RUN cd DeepGEMM && python setup.py install diff --git a/docs/CN/source/getting_started/installation.rst b/docs/CN/source/getting_started/installation.rst index cbf5dc339..6d8732321 100755 --- a/docs/CN/source/getting_started/installation.rst +++ b/docs/CN/source/getting_started/installation.rst @@ -74,8 +74,6 @@ Lightllm 是一个纯python开发的推理框架,其中的算子使用triton $ # 安装lightllm $ python setup.py install -NOTE: 如果您出于一些原因使用了cuda 11.x的torch, 请运行 `pip install nvidia-nccl-cu12==2.20.5` 以支持 torch cuda graph. - ..
note:: Lightllm 的代码在多种GPU上都进行了测试,包括 V100, A100, A800, 4090, 和 H800。 diff --git a/docs/CN/source/tutorial/deepseek_deployment.rst b/docs/CN/source/tutorial/deepseek_deployment.rst index f017ee1f7..f59549fb3 100644 --- a/docs/CN/source/tutorial/deepseek_deployment.rst +++ b/docs/CN/source/tutorial/deepseek_deployment.rst @@ -199,7 +199,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以 --disable_cudagraph \ --pd_master_ip $pd_master_ip \ --pd_master_port 60011 - # if you want to enable microbatch overlap, you can uncomment the following lines + # 如果需要启用微批次重叠,可以取消注释以下行 #--enable_prefill_microbatch_overlap **步骤 3: 启动 Decode 服务** @@ -223,7 +223,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以 --disable_cudagraph \ --pd_master_ip $pd_master_ip \ --pd_master_port 60011 - # if you want to enable microbatch overlap, you can uncomment the following lines + # 如果需要启用微批次重叠,可以取消注释以下行 #--enable_decode_microbatch_overlap 3.2 多 PD Master 模式 @@ -291,7 +291,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以 --disable_cudagraph \ --config_server_host $config_server_host \ --config_server_port 60088 - # if you want to enable microbatch overlap, you can uncomment the following lines + # 如果需要启用微批次重叠,可以取消注释以下行 #--enable_prefill_microbatch_overlap # Decode 服务 @@ -309,7 +309,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以 --enable_fa3 \ --config_server_host $config_server_host \ --config_server_port 60088 - # if you want to enable microbatch overlap, you can uncomment the following lines + # 如果需要启用微批次重叠,可以取消注释以下行 #--enable_decode_microbatch_overlap 4. 测试和验证 diff --git a/docs/EN/.readthedocs.yaml b/docs/EN/.readthedocs.yaml old mode 100644 new mode 100755 diff --git a/docs/EN/source/framework/framework.rst b/docs/EN/source/framework/framework.rst old mode 100644 new mode 100755 diff --git a/docs/EN/source/framework/router.rst b/docs/EN/source/framework/router.rst old mode 100644 new mode 100755 diff --git a/docs/EN/source/framework/token_attention.rst b/docs/EN/source/framework/token_attention.rst old mode 100644 new mode 100755 diff --git a/docs/EN/source/getting_started/benchmark.rst b/docs/EN/source/getting_started/benchmark.rst old mode 100644 new mode 100755 diff --git a/docs/EN/source/getting_started/installation.rst b/docs/EN/source/getting_started/installation.rst index 35b398287..37dad77b6 100755 --- a/docs/EN/source/getting_started/installation.rst +++ b/docs/EN/source/getting_started/installation.rst @@ -74,8 +74,6 @@ You can also install Lightllm from source: $ # Install Lightllm $ python setup.py install -NOTE: If you use torch with cuda 11.x for some reason, please run `pip install nvidia-nccl-cu12==2.20.5` to support torch cuda graph. - .. note:: Lightllm code has been tested on various GPUs including V100, A100, A800, 4090, and H800. 
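Editor's note: for readers installing outside the provided container, the sketch below mirrors the steps in docker/Dockerfile.deepep above on bare metal. The FlashAttention-3 wheel URL, the nvidia-nccl-cu12==2.25.1 pin, and the DeepGEMM source build are taken directly from that Dockerfile; the CUDA 12.4 / Python 3.9 environment and the ModelTC/lightllm clone location are assumptions for illustration, not a tested recipe.

.. code-block:: bash

    $ # Sketch of a source install mirroring docker/Dockerfile.deepep
    $ # (assumes CUDA 12.4 and Python 3.9, to match the cp39 FA3 wheel).
    $ git clone https://github.com/ModelTC/lightllm.git && cd lightllm
    $ pip install -r requirements.txt --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cu124
    $ # Prebuilt FlashAttention-3 wheel used by the Docker image
    $ pip install --no-cache-dir https://github.com/ModelTC/flash-attn-3-build/releases/download/v2.7.4.post1/flash_attn-3.0.0b1-cp39-cp39-linux_x86_64.whl
    $ # Pinned NCCL that works around allreduce hangs on multi-node H100
    $ pip install --no-cache-dir nvidia-nccl-cu12==2.25.1
    $ # DeepGEMM is built from source, as in the Dockerfile
    $ git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git
    $ cd DeepGEMM && python setup.py install && cd ..
    $ # Install LightLLM itself
    $ python setup.py install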
diff --git a/docs/EN/source/tutorial/api_param.rst b/docs/EN/source/tutorial/api_param.rst old mode 100644 new mode 100755 diff --git a/docs/EN/source/tutorial/api_server_args_zh.rst b/docs/EN/source/tutorial/api_server_args_zh.rst old mode 100644 new mode 100755 index a409777e8..3b25ae85c --- a/docs/EN/source/tutorial/api_server_args_zh.rst +++ b/docs/EN/source/tutorial/api_server_args_zh.rst @@ -11,10 +11,10 @@ Basic Configuration Parameters Set the running mode, optional values: * ``normal``: Single server mode (default) - * ``prefill``: Prefill mode (for pd separation running mode) - * ``decode``: Decode mode (for pd separation running mode) - * ``pd_master``: pd master node mode (for pd separation running mode) - * ``config_server``: Configuration server mode (for pd separation mode, used to register pd_master nodes and get pd_master node list), specifically designed for large-scale, high-concurrency scenarios, used when `pd_master` encounters significant CPU bottlenecks. + * ``prefill``: Prefill mode (for pd disaggregation running mode) + * ``decode``: Decode mode (for pd disaggregation running mode) + * ``pd_master``: pd master node mode (for pd disaggregation running mode) + * ``config_server``: Configuration server mode (for pd disaggregation mode, used to register pd_master nodes and get pd_master node list), specifically designed for large-scale, high-concurrency scenarios, used when `pd_master` encounters significant CPU bottlenecks. .. option:: --host @@ -37,7 +37,7 @@ Basic Configuration Parameters Can only choose from ``['tcp://', 'ipc:///tmp/']`` -PD Separation Mode Parameters +PD disaggregation Mode Parameters ---------------------------- .. option:: --pd_master_ip @@ -100,7 +100,7 @@ Memory and Batch Processing Parameters .. option:: --max_total_token_num - Total token count supported by GPU and model, equals max_batch * (input_len + output_len) + Total token number of kv cache. If not specified, will be automatically calculated based on mem_fraction @@ -196,4 +196,313 @@ Attention Type Selection Parameters * ``triton_gqa_flashdecoding``: Fast flashdecoding kernel for models using GQA * ``triton_fp8kv``: Use float8 to store kv cache, currently only used for deepseek2 - Need to read source code to confirm specific modes supported by all models \ No newline at end of file + Need to read source code to confirm specific modes supported by all models + +Scheduling Parameters +-------------------- + +.. option:: --router_token_ratio + + Threshold for determining if the service is busy, default is ``0.0``. Once the kv cache usage exceeds this value, it will directly switch to conservative scheduling. + +.. option:: --router_max_new_token_len + + The request output length used by the scheduler when evaluating request kv usage, default is ``1024``, generally lower than the max_new_tokens set by the user. This parameter only takes effect when --router_token_ratio is greater than 0. + Setting this parameter will make request scheduling more aggressive, allowing the system to process more requests simultaneously, but will inevitably cause request pause and recalculation. + +.. option:: --router_max_wait_tokens + + Trigger scheduling of new requests every router_max_wait_tokens decoding steps, default is ``6`` + +.. option:: --disable_aggressive_schedule + + Disable aggressive scheduling + + Aggressive scheduling may cause frequent prefill interruptions during decoding. Disabling it can make the router_max_wait_tokens parameter work more effectively. + +.. 
option:: --disable_dynamic_prompt_cache + + Disable kv cache caching + +.. option:: --chunked_prefill_size + + Chunked prefill size, default is ``4096`` + +.. option:: --disable_chunked_prefill + + Whether to disable chunked prefill + +.. option:: --diverse_mode + + Multi-result output mode + +Output Constraint Parameters +--------------------------- + +.. option:: --token_healing_mode + +.. option:: --output_constraint_mode + + Set the output constraint backend, optional values: + + * ``outlines``: Use outlines backend + * ``xgrammar``: Use xgrammar backend + * ``none``: No output constraint (default) + +.. option:: --first_token_constraint_mode + + Constrain the allowed range of the first token + Use environment variable FIRST_ALLOWED_TOKENS to set the range, e.g., FIRST_ALLOWED_TOKENS=1,2 + +Multimodal Parameters +-------------------- + +.. option:: --enable_multimodal + + Whether to allow loading additional visual models + +.. option:: --enable_multimodal_audio + + Whether to allow loading additional audio models (requires --enable_multimodal) + +.. option:: --enable_mps + + Whether to enable nvidia mps for multimodal services + +.. option:: --cache_capacity + + Cache server capacity for multimodal resources, default is ``200`` + +.. option:: --cache_reserved_ratio + + Reserved capacity ratio after cache server cleanup, default is ``0.5`` + +.. option:: --visual_infer_batch_size + + Number of images processed in each inference batch, default is ``1`` + +.. option:: --visual_gpu_ids + + List of GPU IDs to use, e.g., 0 1 2 + +.. option:: --visual_tp + + Number of tensor parallel instances for ViT, default is ``1`` + +.. option:: --visual_dp + + Number of data parallel instances for ViT, default is ``1`` + +.. option:: --visual_nccl_ports + + List of NCCL ports for ViT, e.g., 29500 29501 29502, default is [29500] + +Performance Optimization Parameters +---------------------------------- + +.. option:: --disable_custom_allreduce + + Whether to disable custom allreduce + +.. option:: --enable_custom_allgather + + Whether to enable custom allgather + +.. option:: --enable_tpsp_mix_mode + + The inference backend will use TP SP mixed running mode + + Currently only supports llama and deepseek series models + +.. option:: --enable_prefill_microbatch_overlap + + The inference backend will use microbatch overlap mode for prefill + + Currently only supports deepseek series models + +.. option:: --enable_decode_microbatch_overlap + + The inference backend will use microbatch overlap mode for decoding + +.. option:: --enable_flashinfer_prefill + + The inference backend will use flashinfer's attention kernel for prefill + +.. option:: --enable_flashinfer_decode + + The inference backend will use flashinfer's attention kernel for decoding + +.. option:: --enable_fa3 + + The inference backend will use fa3 attention kernel for prefill and decoding + +.. option:: --disable_cudagraph + + Disable cudagraph in the decoding phase + +.. option:: --graph_max_batch_size + + Maximum batch size that can be captured by cuda graph in the decoding phase, default is ``256`` + +.. option:: --graph_split_batch_size + + Controls the interval for generating CUDA graphs during decoding, default is ``32`` + + For values from 1 to the specified graph_split_batch_size, CUDA graphs will be generated continuously. + For values from graph_split_batch_size to graph_max_batch_size, + a new CUDA graph will be generated for every increase of graph_grow_step_size. 
+ Properly configuring this parameter can help optimize the performance of CUDA graph execution. + +.. option:: --graph_grow_step_size + + For batch_size values from graph_split_batch_size to graph_max_batch_size, + a new CUDA graph will be generated for every increase of graph_grow_step_size, default is ``16`` + +.. option:: --graph_max_len_in_batch + + Maximum sequence length that can be captured by cuda graph in the decoding phase, default is ``max_req_total_len`` + +Quantization Parameters +---------------------- + +.. option:: --quant_type + + Quantization method, optional values: + + * ``ppl-w4a16-128`` + * ``flashllm-w6a16`` + * ``ao-int4wo-[32,64,128,256]`` + * ``ao-int8wo`` + * ``ao-fp8w8a16`` + * ``ao-fp6w6a16`` + * ``vllm-w8a8`` + * ``vllm-fp8w8a8`` + * ``vllm-fp8w8a8-b128`` + * ``triton-fp8w8a8-block128`` + * ``none`` (default) + +.. option:: --quant_cfg + + Path to quantization configuration file. Can be used for mixed quantization. + + Examples can be found in test/advanced_config/mixed_quantization/llamacls-mix-down.yaml. + +.. option:: --vit_quant_type + + ViT quantization method, optional values: + + * ``ppl-w4a16-128`` + * ``flashllm-w6a16`` + * ``ao-int4wo-[32,64,128,256]`` + * ``ao-int8wo`` + * ``ao-fp8w8a16`` + * ``ao-fp6w6a16`` + * ``vllm-w8a8`` + * ``vllm-fp8w8a8`` + * ``none`` (default) + +.. option:: --vit_quant_cfg + + Path to ViT quantization configuration file. Can be used for mixed quantization. + + Examples can be found in lightllm/common/quantization/configs. + +Sampling and Generation Parameters +-------------------------------- + +.. option:: --sampling_backend + + Implementation used for sampling, optional values: + + * ``triton``: Use torch and triton kernel (default) + * ``sglang_kernel``: Use sglang_kernel implementation + +.. option:: --return_all_prompt_logprobs + + Return logprobs for all prompt tokens + +.. option:: --use_reward_model + + Use reward model + +.. option:: --long_truncation_mode + + How to handle when input_token_len + max_new_tokens > max_req_total_len, optional values: + + * ``None``: Throw exception (default) + * ``head``: Remove some head tokens to make input_token_len + max_new_tokens <= max_req_total_len + * ``center``: Remove some tokens at the center position to make input_token_len + max_new_tokens <= max_req_total_len + +.. option:: --use_tgi_api + + Use tgi input and output format + +MTP Multi-Prediction Parameters +------------------------------ + +.. option:: --mtp_mode + + Supported mtp modes, optional values: + + * ``deepseekv3`` + * ``None``: Do not enable mtp (default) + +.. option:: --mtp_draft_model_dir + + Path to the draft model for MTP multi-prediction functionality + + Used to load the MTP multi-output token model. + +.. option:: --mtp_step + + Specify the number of additional tokens predicted by the draft model, default is ``0`` + + Currently this feature only supports DeepSeekV3/R1 models. + Increasing this value allows more predictions, but ensure the model is compatible with the specified number of steps. + Currently deepseekv3/r1 models only support 1 step + +DeepSeek Redundant Expert Parameters +----------------------------------- + +.. option:: --ep_redundancy_expert_config_path + + Path to redundant expert configuration. Can be used for deepseekv3 models. + +.. option:: --auto_update_redundancy_expert + + Whether to update redundant experts for deepseekv3 models through online expert usage counters. + +Monitoring and Logging Parameters +-------------------------------- + +.. 
option:: --disable_log_stats + + Disable throughput statistics logging + +.. option:: --log_stats_interval + + Interval for recording statistics (seconds), default is ``10`` + +.. option:: --health_monitor + + Check service health status and restart on error + +.. option:: --metric_gateway + + Address for collecting monitoring metrics + +.. option:: --job_name + + Job name for monitoring, default is ``lightllm`` + +.. option:: --grouping_key + + Grouping key for monitoring, format is key=value, can specify multiple + +.. option:: --push_interval + + Interval for pushing monitoring metrics (seconds), default is ``10`` + +.. option:: --enable_monitor_auth + + Whether to enable authentication for push_gateway \ No newline at end of file diff --git a/docs/EN/source/tutorial/deepseek_deployment.rst b/docs/EN/source/tutorial/deepseek_deployment.rst old mode 100644 new mode 100755 index 35f54ea1a..9e2624bb8 --- a/docs/EN/source/tutorial/deepseek_deployment.rst +++ b/docs/EN/source/tutorial/deepseek_deployment.rst @@ -10,26 +10,26 @@ Deployment Mode Overview LightLLM supports the following deployment modes: -1. **Single Machine TP Mode**: Deploy using tensor parallelism on a single machine -2. **Single Machine EP Mode**: Deploy using expert parallelism on a single machine -3. **Multi-Machine TP Mode**: Use tensor parallelism across multiple machines -4. **Multi-Machine EP Mode**: Use expert parallelism across multiple machines -5. **PD Separation Mode**: Separate prefill and decode deployment +1. **Single node TP Mode**: Deploy using tensor parallelism on a single node +2. **Single node EP Mode**: Deploy using expert parallelism on a single node +3. **Multi-node TP Mode**: Use tensor parallelism across multiple nodes +4. **Multi-node EP Mode**: Use expert parallelism across multiple nodes +5. **PD disaggregation Mode**: Separate prefill and decode deployment 6. **Multi PD Master Mode**: Support multiple PD Master nodes -1. Single Machine Deployment Solutions +1. Single node Deployment Solutions ------------------------------------- -1.1 Single Machine TP Mode (Tensor Parallel) +1.1 Single node TP Mode (Tensor Parallel) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Suitable for deploying DeepSeek-R1 model on a single H200 machine. +Suitable for deploying DeepSeek-R1 model on a single H200 node. **Launch Command:** .. code-block:: bash - # H200 Single Machine DeepSeek-R1 TP Mode + # H200 Single node DeepSeek-R1 TP Mode LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ --model_dir /path/DeepSeek-R1 \ --tp 8 \ @@ -37,11 +37,11 @@ Suitable for deploying DeepSeek-R1 model on a single H200 machine. **Parameter Description:** - `LOADWORKER=18`: Model loading thread count, improves loading speed -- `--tp 8`: Tensor parallelism degree, using 8 GPUs +- `--tp 8`: Tensor parallelism, using 8 GPUs - `--enable_fa3`: Enable Flash Attention 3.0 - `--port 8088`: Service port -1.2 Single Machine DP + EP Mode (Data Parallel + Expert Parallel) +1.2 Single node DP + EP Mode (Data Parallel + Expert Parallel) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suitable for expert parallelism deployment of MoE models like DeepSeek-V2/V3. @@ -50,7 +50,7 @@ Suitable for expert parallelism deployment of MoE models like DeepSeek-V2/V3. .. 
code-block:: bash - # H200 Single Machine DeepSeek-R1 DP + EP Mode + # H200 Single node DeepSeek-R1 DP + EP Mode MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ --model_dir /path/DeepSeek-R1 \ --tp 8 \ @@ -59,27 +59,27 @@ Suitable for expert parallelism deployment of MoE models like DeepSeek-V2/V3. **Parameter Description:** - `MOE_MODE=EP`: Set expert parallelism mode -- `--tp 8`: Tensor parallelism degree -- `--dp 8`: Data parallelism degree, usually set to the same value as tp +- `--tp 8`: Tensor parallelism +- `--dp 8`: Data parallelism, usually set to the same value as tp - `--enable_fa3`: Enable Flash Attention 3.0 **Optional Optimization Parameters:** - `--enable_prefill_microbatch_overlap`: Enable prefill microbatch overlap - `--enable_decode_microbatch_overlap`: Enable decode microbatch overlap -2. Multi-Machine Deployment Solutions +2. Multi-node Deployment Solutions ------------------------------------ -2.1 Multi-Machine TP Mode +2.1 Multi-node TP Mode ~~~~~~~~~~~~~~~~~~~~~~~~~ -Suitable for deployment across multiple H200/H100 machines. +Suitable for deployment across multiple H200/H100 nodes. **Node 0 Launch Command:** .. code-block:: bash - # H200/H100 Multi-Machine DeepSeek-R1 TP Mode Node 0 + # H200/H100 Multi-node DeepSeek-R1 TP Mode Node 0 # Usage: sh multi_node_tp_node0.sh export nccl_host=$1 LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ @@ -95,7 +95,7 @@ Suitable for deployment across multiple H200/H100 machines. .. code-block:: bash - # H200/H100 Multi-Machine DeepSeek-R1 TP Mode Node 1 + # H200/H100 Multi-node DeepSeek-R1 TP Mode Node 1 # Usage: sh multi_node_tp_node1.sh export nccl_host=$1 LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ @@ -113,16 +113,16 @@ Suitable for deployment across multiple H200/H100 machines. - `--nccl_host`: NCCL communication host address - `--nccl_port 2732`: NCCL communication port -2.2 Multi-Machine EP Mode +2.2 Multi-node EP Mode ~~~~~~~~~~~~~~~~~~~~~~~~~ -Suitable for deploying MoE models across multiple machines. +Suitable for deploying MoE models across multiple nodes. **Node 0 Launch Command:** .. code-block:: bash - # H200 Multi-Machine DeepSeek-R1 EP Mode Node 0 + # H200 Multi-node DeepSeek-R1 EP Mode Node 0 # Usage: sh multi_node_ep_node0.sh export nccl_host=$1 MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ @@ -139,7 +139,7 @@ Suitable for deploying MoE models across multiple machines. .. code-block:: bash - # H200 Multi-Machine DeepSeek-R1 EP Mode Node 1 + # H200 Multi-node DeepSeek-R1 EP Mode Node 1 # Usage: sh multi_node_ep_node1.sh export nccl_host=$1 MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ @@ -156,10 +156,10 @@ Suitable for deploying MoE models across multiple machines. - `--enable_prefill_microbatch_overlap`: Enable prefill microbatch overlap - `--enable_decode_microbatch_overlap`: Enable decode microbatch overlap -3. PD Separation Deployment Solutions +3. PD disaggregation Deployment Solutions ------------------------------------ -PD (Prefill-Decode) separation mode separates prefill and decode stages for deployment, which can better utilize hardware resources. +PD (Prefill-Decode) disaggregation mode separates prefill and decode stages for deployment, which can better utilize hardware resources. 
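Editor's note: for orientation before the step-by-step commands, the three roles are started in order: pd_master first, then the prefill and decode services, which register with the master. The sketch below shows a minimal PD Master launch for the single-master setup; the flags are modeled on the pd_master example in section 3.2 and on the --pd_master_ip / --pd_master_port 60011 values used by the prefill and decode commands, so treat them as illustrative rather than canonical.

.. code-block:: bash

    # Sketch of Step 1 (launch the PD Master) in single-master mode.
    # The prefill and decode services in the following steps point at this
    # process via --pd_master_ip / --pd_master_port 60011; flags mirror the
    # pd_master example in section 3.2 and are illustrative here.
    export host=$1
    python -m lightllm.server.api_server \
        --model_dir /path/DeepSeek-R1 \
        --run_mode "pd_master" \
        --host $host \
        --port 60011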
3.1 Single PD Master Mode ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -182,7 +182,7 @@ PD (Prefill-Decode) separation mode separates prefill and decode stages for depl # PD prefill mode for DeepSeek-R1 (DP+EP) on H200 # Usage: sh pd_prefill.sh - # nvidia-cuda-mps-control -d, run MPS (optional, performance will be much better with mps support, but some graphics cards and driver environments may encounter errors when enabling mps, it's recommended to upgrade to a higher driver version, especially for H-series cards) + # nvidia-cuda-mps-control -d, run MPS (optional, performance will be much better with mps support, but some GPUs may encounter errors when enabling mps, it's recommended to upgrade to a higher driver version, especially for H-series cards) export host=$1 export pd_master_ip=$2 @@ -197,4 +197,147 @@ PD (Prefill-Decode) separation mode separates prefill and decode stages for depl --nccl_port 2732 \ --enable_fa3 \ --disable_cudagraph \ - --pd_master_ip $pd_master_ip \ No newline at end of file + --pd_master_ip $pd_master_ip + +**Step 3: Launch Decode Service** + +.. code-block:: bash + + # PD decode mode for DeepSeek-R1 (DP+EP) on H200 + # Usage: sh pd_decode.sh + export host=$1 + export pd_master_ip=$2 + nvidia-cuda-mps-control -d + MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "decode" \ + --tp 8 \ + --dp 8 \ + --host $host \ + --port 8121 \ + --nccl_port 12322 \ + --enable_fa3 \ + --disable_cudagraph \ + --pd_master_ip $pd_master_ip \ + --pd_master_port 60011 + # if you want to enable microbatch overlap, you can uncomment the following lines + #--enable_decode_microbatch_overlap + +3.2 Multi PD Master Mode +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Supports multiple PD Master nodes, providing better load balancing and high availability. + +**Step 1: Launch Config Server** + +.. code-block:: bash + + # Config Server + # Usage: sh config_server.sh + export config_server_host=$1 + python -m lightllm.server.api_server \ + --run_mode "config_server" \ + --config_server_host $config_server_host \ + --config_server_port 60088 + +**Step 2: Launch Multiple PD Masters** + +.. code-block:: bash + + # PD Master 1 + # Usage: sh pd_master_1.sh + export host=$1 + export config_server_host=$2 + python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "pd_master" \ + --host $host \ + --port 60011 \ + --config_server_host $config_server_host \ + --config_server_port 60088 + + # PD Master 2 + # Usage: sh pd_master_2.sh + export host=$1 + export config_server_host=$2 + python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "pd_master" \ + --host $host \ + --port 60012 \ + --config_server_host $config_server_host \ + --config_server_port 60088 + +**Step 3: Launch Prefill and Decode Services** + +.. 
code-block:: bash + + # Prefill Service + export host=$1 + export config_server_host=$2 + nvidia-cuda-mps-control -d + MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "prefill" \ + --host $host \ + --port 8019 \ + --tp 8 \ + --dp 8 \ + --nccl_port 2732 \ + --enable_fa3 \ + --disable_cudagraph \ + --config_server_host $config_server_host \ + --config_server_port 60088 + # if you want to enable microbatch overlap, you can uncomment the following lines + #--enable_prefill_microbatch_overlap + + # Decode Service + export host=$1 + export config_server_host=$2 + nvidia-cuda-mps-control -d + MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "decode" \ + --host $host \ + --port 8121 \ + --nccl_port 12322 \ + --tp 8 \ + --dp 8 \ + --enable_fa3 \ + --config_server_host $config_server_host \ + --config_server_port 60088 + # if you want to enable microbatch overlap, you can uncomment the following lines + #--enable_decode_microbatch_overlap + +4. Testing and Validation +------------------------- + +4.1 Basic Functionality Testing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + curl http://server_ip:server_port/generate \ + -H "Content-Type: application/json" \ + -d '{ + "inputs": "What is AI?", + "parameters":{ + "max_new_tokens":17, + "frequency_penalty":1 + } + }' + +4.2 Performance Benchmark Testing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # DeepSeek-R1 Performance Testing + cd test + python benchmark_client.py \ + --num_clients 100 \ + --input_num 2000 \ + --tokenizer_path /path/DeepSeek-R1/ \ + --url http://127.0.0.1:8088/generate_stream + +All the above scripts can be referenced from the scripts in the `test/start_scripts/multi_pd_master/` directory. \ No newline at end of file diff --git a/docs/EN/source/tutorial/multimodal.rst b/docs/EN/source/tutorial/multimodal.rst old mode 100644 new mode 100755 diff --git a/docs/EN/source/tutorial/openai.rst b/docs/EN/source/tutorial/openai.rst old mode 100644 new mode 100755 diff --git a/docs/EN/source/tutorial/reward_model.rst b/docs/EN/source/tutorial/reward_model.rst old mode 100644 new mode 100755
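Editor's note: as a complement to the non-streaming curl check in section 4.1, the sketch below exercises the /generate_stream route that benchmark_client.py targets above. The request payload is the same as in section 4.1; the exact framing of the streamed chunks may differ between LightLLM versions, so the example simply prints the raw stream.

.. code-block:: bash

    # Streaming smoke test against the endpoint used by benchmark_client.py.
    # curl -N disables output buffering so chunks print as they arrive;
    # replace the host/port with your pd_master or single-server address.
    curl -N http://127.0.0.1:8088/generate_stream \
        -H "Content-Type: application/json" \
        -d '{
            "inputs": "What is AI?",
            "parameters": {
                "max_new_tokens": 17,
                "frequency_penalty": 1
            }
        }'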