21 changes: 11 additions & 10 deletions docker/Dockerfile
@@ -1,10 +1,8 @@
FROM nvcr.io/nvidia/tritonserver:24.04-py3-min as base
ARG PYTORCH_VERSION=2.6.0
ARG PYTHON_VERSION=3.9
ARG CUDA_VERSION=12.4
ARG MAMBA_VERSION=23.1.0-1
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG TARGETPLATFORM

ENV PATH=/opt/conda/bin:$PATH \
CONDA_PREFIX=/opt/conda

@@ -21,7 +19,7 @@ RUN case ${TARGETPLATFORM} in \
"linux/arm64") MAMBA_ARCH=aarch64 ;; \
*) MAMBA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
bash ~/mambaforge.sh -b -p /opt/conda && \
rm ~/mambaforge.sh

@@ -36,11 +34,14 @@ RUN case ${TARGETPLATFORM} in \
WORKDIR /root

COPY ./requirements.txt /lightllm/requirements.txt
RUN pip install -r /lightllm/requirements.txt --no-cache-dir --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124
RUN pip install -U pip
RUN pip install -r /lightllm/requirements.txt --no-cache-dir

RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly

RUN pip install --no-cache-dir https://github.com/ModelTC/flash-attn-3-build/releases/download/v2.7.4.post1/flash_attn-3.0.0b1-cp39-cp39-linux_x86_64.whl
RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .
Review comment (Contributor, medium):

The cloned LightKernel repository is not removed after installation. This increases the final Docker image size. It's a good practice to clean up build artifacts within the same RUN layer.

Suggested change:

RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . && cd .. && rm -rf LightKernel


RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100
RUN apt-get update && apt-get install -y libnuma-dev # for sgl_kernel
Review comment (Contributor, medium):

To reduce the Docker image size, it's recommended to clean up the apt cache after installing packages. This should be done in the same RUN instruction to be effective. Using --no-install-recommends is also a good practice to avoid pulling in unnecessary packages.

Suggested change:

RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/* # for sgl_kernel


COPY . /lightllm
RUN pip install -e /lightllm --no-cache-dir
57 changes: 31 additions & 26 deletions docker/Dockerfile.deepep
@@ -1,10 +1,8 @@
FROM nvcr.io/nvidia/tritonserver:24.04-py3-min as base
ARG PYTORCH_VERSION=2.6.0
ARG PYTHON_VERSION=3.9
ARG CUDA_VERSION=12.4
ARG MAMBA_VERSION=23.1.0-1
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG TARGETPLATFORM

ENV PATH=/opt/conda/bin:$PATH \
CONDA_PREFIX=/opt/conda

@@ -21,7 +19,7 @@ RUN case ${TARGETPLATFORM} in \
"linux/arm64") MAMBA_ARCH=aarch64 ;; \
*) MAMBA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
bash ~/mambaforge.sh -b -p /opt/conda && \
rm ~/mambaforge.sh

@@ -36,39 +34,46 @@ RUN case ${TARGETPLATFORM} in \
WORKDIR /root

COPY ./requirements.txt /lightllm/requirements.txt
RUN pip install -r /lightllm/requirements.txt --no-cache-dir --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124
RUN pip install -U pip
RUN pip install -r /lightllm/requirements.txt --no-cache-dir

RUN pip install --no-cache-dir https://github.com/ModelTC/flash-attn-3-build/releases/download/v2.7.4.post1/flash_attn-3.0.0b1-cp39-cp39-linux_x86_64.whl
RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly

RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100
RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .
Review comment (Contributor, medium):

The cloned LightKernel repository is not removed after installation. This increases the final Docker image size. It's a good practice to clean up build artifacts within the same RUN layer.

Suggested change:

RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v . && cd .. && rm -rf LightKernel


RUN git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git
RUN cd DeepGEMM && python setup.py install
RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms
RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev
Comment on lines +44 to +45
Review comment (Contributor, high):

These two RUN instructions for apt-get should be combined into a single layer to optimize the Docker build process and reduce image size. Also, it's a best practice to:

  • Use --no-install-recommends to avoid installing unnecessary packages.
  • Clean up the apt cache with rm -rf /var/lib/apt/lists/* in the same RUN command to prevent caching old package lists and to reduce image size.

Suggested change:
RUN apt-get update && apt-get install -y --no-install-recommends \
    libnuma-dev wget devscripts debhelper dh-make build-essential dkms \
    ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev \
    && rm -rf /var/lib/apt/lists/*


WORKDIR /root
RUN git clone https://github.com/deepseek-ai/DeepEP.git
ENV CUDA_HOME=/usr/local/cuda \
GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

# NVSHMEM
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
RUN tar -xf nvshmem_src_3.2.5-1.txz \
&& mv nvshmem_src nvshmem
RUN mkdir -p /tmp/gdrcopy && cd /tmp \
&& git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
&& cd gdrcopy/packages \
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
&& dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
&& cd / && rm -rf /tmp/gdrcopy

WORKDIR /root/nvshmem
RUN git apply /root/DeepEP/third-party/nvshmem.patch
# Fix DeepEP IBGDA symlink
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so

WORKDIR /root/nvshmem
ENV CUDA_HOME=/usr/local/cuda
RUN NVSHMEM_SHMEM_SUPPORT=0 \
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
&& tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
&& cd nvshmem \
&& rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
&& NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 -DMLX5_lib=/usr/lib/x86_64-linux-gnu/libmlx5.so.1 \
&& cd build \
&& make install -j64
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \
&& cmake --build build --target install -j64
Review comment (Contributor, medium):

The nvshmem source directory is kept after installation, which increases the image size. It can be removed after the build and install is complete within the same RUN layer to be effective.

Suggested change:

 && cmake --build build --target install -j64 && cd /root && rm -rf nvshmem


ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58
RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..
Review comment (Contributor, medium):

The cloned DeepEP repository is not removed after installation, which increases the final Docker image size. Consider removing the repository after it's installed in the subsequent steps. This should be done in the same RUN layer as the installation to be effective.


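A minimal sketch of how that cleanup might look, assuming DeepEP is installed with pip from the cloned source. The actual install step is not shown in this diff, so the pip command and the single-layer layout below are assumptions rather than part of this PR.

# Hypothetical single-layer variant (not part of this PR): clone, pin, install,
# and delete the source tree in one RUN so the repository never persists in the
# final image. NVSHMEM_DIR points at the install prefix built above.
ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58
RUN git clone https://github.com/deepseek-ai/DeepEP.git \
    && cd DeepEP && git checkout ${DEEPEP_COMMIT} \
    && NVSHMEM_DIR=/root/nvshmem/install pip install --no-deps -v . \
    && cd .. && rm -rf DeepEP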
WORKDIR /root/DeepEP
ENV NVSHMEM_DIR=/root/nvshmem/install
2 changes: 1 addition & 1 deletion lightllm/models/qwen2_vl/vision_process.py
@@ -44,7 +44,6 @@
ChannelDimension,
ImageInput,
PILImageResampling,
VideoInput,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
@@ -54,6 +53,7 @@
valid_images,
validate_preprocess_arguments,
)
from transformers.video_utils import VideoInput
from transformers.utils import TensorType, is_vision_available, logging

logger = logging.get_logger(__name__)
43 changes: 25 additions & 18 deletions lightllm/models/vit/triton_kernel/flashattention_nopad.py
@@ -152,7 +152,7 @@ def _flash_attention_triton_fwd(

_flash_attn_v3_available = False
try:
from flash_attn_interface import _flash_attn_forward
from sgl_kernel.flash_attn import flash_attn_varlen_func

_flash_attn_v3_available = True

@@ -166,36 +166,43 @@ def flash_attention_v3_fwd(
):
head_dim = q.shape[-1]
softmax_scale = head_dim ** -0.5
_flash_attn_forward(
window_size = (-1, -1)
torch.ops.sgl_kernel.fwd.default(
q,
k,
v,
None,
None, # k_new, v_new
None, # k_new
None, # v_new
None, # qv
o, # out
cu_seqlens,
cu_seqlens,
None, # cu_seqlens_q/k/k_new
None,
None, # seqused_q/k
max_seqlen,
max_seqlen, # max_seqlen_q/k
None,
None, # cu_seqlens_k_new
None,
None, # page_table, kv_batch_idx, leftpad_k,
None,
None, # rotary_cos/sin
max_seqlen,
max_seqlen,
None, # page_table,
None, # kv_batch_idx
None, # leftpad_k
None, # rotary cos
None, # rotary sin
None, # seqlens_rotary
None,
None,
None,
softmax_scale,
False, # causal
window_size=(-1, -1),
softcap=0.0,
False,
window_size[0],
window_size[1],
0.0,
is_rotary_interleaved=False,
scheduler_metadata=None,
num_splits=1,
pack_gqa=None,
sm_margin=0,
)

return

except ImportError:
@@ -205,10 +212,10 @@ def flash_attention_v3_fwd

def flash_attention_fwd(q, k, v, o, cu_seqlens, max_seqlen):
"""
Unified Flash Attention interface. If _flash_attn_forward is available,
use flash_attention_v3_fwd; otherwise use the Triton version.
Unified Flash Attention interface. If sgl_kernel is available,
use the sgl_kernel interface; otherwise use the Triton version.
"""
if _flash_attn_v3_available and is_hopper():
if _flash_attn_v3_available and is_hopper() and False:
Review comment (Contributor, critical):

The condition ... and False permanently disables this code path, forcing the function to always use the Triton kernel implementation. This looks like a temporary debugging change that should not be merged. If the flash_attention_v3_fwd path is not ready, it should be removed or properly guarded. If it is intended to be used, this and False must be removed.

Suggested change:

if _flash_attn_v3_available and is_hopper() and False:
if _flash_attn_v3_available and is_hopper():

flash_attention_v3_fwd(q, k, v, o, cu_seqlens, max_seqlen)
else:
_flash_attention_triton_fwd(q, k, v, o, cu_seqlens, max_seqlen)
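On the critical comment above ("removed or properly guarded"): if the sgl_kernel path needs to stay in the tree but remain off by default, one possible guard is an opt-in environment variable. This is only a sketch; the flag name LIGHTLLM_ENABLE_FA3 is hypothetical and not part of this PR, and the other names are the ones already defined in this file.

import os

# Hypothetical opt-in flag (not part of this PR): the sgl_kernel fwd path is
# taken only when the kernel imported successfully, the GPU is Hopper, and the
# user explicitly enables it, so the Triton fallback remains the default.
_ENABLE_FA3 = os.getenv("LIGHTLLM_ENABLE_FA3", "0") == "1"


def flash_attention_fwd(q, k, v, o, cu_seqlens, max_seqlen):
    if _flash_attn_v3_available and is_hopper() and _ENABLE_FA3:
        flash_attention_v3_fwd(q, k, v, o, cu_seqlens, max_seqlen)
    else:
        _flash_attention_triton_fwd(q, k, v, o, cu_seqlens, max_seqlen)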
12 changes: 4 additions & 8 deletions requirements.txt
@@ -34,7 +34,7 @@ multiprocessing-logging==0.3.4
networkx==3.1
ninja==1.11.1
numpy==1.25.1
packaging==23.1
packaging==24.2
pip==23.0.1
pluggy==1.2.0
plumbum==1.8.2
@@ -54,18 +54,15 @@ ruamel.yaml==0.17.32
ruamel.yaml.clib==0.2.7
s3transfer==0.6.1
sentencepiece==0.2.0
setuptools==65.6.3
setuptools==77.0.3
six==1.16.0
sniffio==1.3.0
sympy==1.13.1
sortedcontainers==2.4.0
toolz==0.12.0
torch==2.6.0
torchvision==0.21.0
torch==2.7.1
tqdm==4.65.0
transformers==4.51.2
tokenizers==0.21.1
triton==3.2.0
urllib3==1.26.16
uvicorn==0.19.0
uvloop==0.17.0
@@ -83,9 +80,8 @@ frozendict==2.4.6
atomics==1.0.3
easydict==1.13
gunicorn==23.0.0
vllm==0.8.5
flashinfer-python==0.2.4
sgl-kernel==0.1.4
sgl-kernel==0.2.6
httpx==0.28.1
librosa==0.11.0
cuda_bindings==12.9.0