Merged
2 changes: 1 addition & 1 deletion README.md
@@ -29,7 +29,7 @@ LightLLM is a Python-based LLM (Large Language Model) inference and serving fram

- [Install LightLLM](https://lightllm-en.readthedocs.io/en/latest/getting_started/installation.html)
- [Quick Start](https://lightllm-en.readthedocs.io/en/latest/getting_started/quickstart.html)
- [TuTorial](https://lightllm-en.readthedocs.io/en/latest/tutorial/)
- [TuTorial](https://lightllm-en.readthedocs.io/en/latest/tutorial/deepseek_deployment.html)


## Performance
7 changes: 2 additions & 5 deletions docker/Dockerfile.deepep
@@ -17,8 +17,6 @@ RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-ge
git && \
rm -rf /var/lib/apt/lists/*

ENV http_proxy=http://devsft:[email protected]:3128
ENV https_proxy=http://devsft:[email protected]:3128
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MAMBA_ARCH=aarch64 ;; \
*) MAMBA_ARCH=x86_64 ;; \
@@ -40,10 +38,9 @@ WORKDIR /root
COPY ./requirements.txt /lightllm/requirements.txt
RUN pip install -r /lightllm/requirements.txt --no-cache-dir --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124

RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100
RUN pip install --no-cache-dir https://github.com/ModelTC/flash-attn-3-build/releases/download/v2.7.4.post1/flash_attn-3.0.0b1-cp39-cp39-linux_x86_64.whl
Contributor review comment (medium):

Consider combining the pip install commands into a single RUN statement to reduce the number of layers in the Docker image.

RUN pip install --no-cache-dir https://github.com/ModelTC/flash-attn-3-build/releases/download/v2.7.4.post1/flash_attn-3.0.0b1-cp39-cp39-linux_x86_64.whl nvidia-nccl-cu12==2.25.1
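
If the maintainers adopt that suggestion, a minimal sketch of the combined layer (assuming the same package versions and wheel URL already used in this Dockerfile) could look like:

# Hypothetical combined layer: a single RUN statement produces one pip image layer
RUN pip install --no-cache-dir \
        nvidia-nccl-cu12==2.25.1 \
        https://github.com/ModelTC/flash-attn-3-build/releases/download/v2.7.4.post1/flash_attn-3.0.0b1-cp39-cp39-linux_x86_64.whl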


RUN git clone https://github.com/Dao-AILab/flash-attention.git -b v2.7.4.post1
RUN cd flash-attention/hopper && FLASH_ATTN_CUDA_ARCHS=90 NVCC_THREADS=128 python setup.py install
RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100

RUN git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git
RUN cd DeepGEMM && python setup.py install
2 changes: 0 additions & 2 deletions docs/CN/source/getting_started/installation.rst
@@ -74,8 +74,6 @@ Lightllm 是一个纯python开发的推理框架,其中的算子使用triton
$ # 安装lightllm
$ python setup.py install

NOTE: 如果您出于一些原因使用了cuda 11.x的torch, 请运行 `pip install nvidia-nccl-cu12==2.20.5` 以支持 torch cuda graph.

.. note::

Lightllm 的代码在多种GPU上都进行了测试,包括 V100, A100, A800, 4090, 和 H800。
8 changes: 4 additions & 4 deletions docs/CN/source/tutorial/deepseek_deployment.rst
@@ -199,7 +199,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
--disable_cudagraph \
--pd_master_ip $pd_master_ip \
--pd_master_port 60011
# if you want to enable microbatch overlap, you can uncomment the following lines
# 如果需要启用微批次重叠,可以取消注释以下行
#--enable_prefill_microbatch_overlap

**步骤 3: 启动 Decode 服务**
@@ -223,7 +223,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
--disable_cudagraph \
--pd_master_ip $pd_master_ip \
--pd_master_port 60011
# if you want to enable microbatch overlap, you can uncomment the following lines
# 如果需要启用微批次重叠,可以取消注释以下行
#--enable_decode_microbatch_overlap

3.2 多 PD Master 模式
@@ -291,7 +291,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
--disable_cudagraph \
--config_server_host $config_server_host \
--config_server_port 60088
# if you want to enable microbatch overlap, you can uncomment the following lines
# 如果需要启用微批次重叠,可以取消注释以下行
#--enable_prefill_microbatch_overlap

# Decode 服务
@@ -309,7 +309,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
--enable_fa3 \
--config_server_host $config_server_host \
--config_server_port 60088
# if you want to enable microbatch overlap, you can uncomment the following lines
# 如果需要启用微批次重叠,可以取消注释以下行
#--enable_decode_microbatch_overlap

4. 测试和验证
Empty file modified docs/EN/.readthedocs.yaml (100644 → 100755)
Empty file modified docs/EN/source/framework/framework.rst (100644 → 100755)
Empty file modified docs/EN/source/framework/router.rst (100644 → 100755)
Empty file modified docs/EN/source/framework/token_attention.rst (100644 → 100755)
Empty file modified docs/EN/source/getting_started/benchmark.rst (100644 → 100755)
2 changes: 0 additions & 2 deletions docs/EN/source/getting_started/installation.rst
@@ -74,8 +74,6 @@ You can also install Lightllm from source:
$ # Install Lightllm
$ python setup.py install

NOTE: If you use torch with cuda 11.x for some reason, please run `pip install nvidia-nccl-cu12==2.20.5` to support torch cuda graph.

.. note::

Lightllm code has been tested on various GPUs including V100, A100, A800, 4090, and H800.
Empty file modified docs/EN/source/tutorial/api_param.rst (100644 → 100755)