diff --git a/docker/Dockerfile b/docker/Dockerfile
index 9392ec268..653b227a8 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,10 +1,8 @@
-FROM nvcr.io/nvidia/tritonserver:24.04-py3-min as base
-ARG PYTORCH_VERSION=2.6.0
-ARG PYTHON_VERSION=3.9
-ARG CUDA_VERSION=12.4
-ARG MAMBA_VERSION=23.1.0-1
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
+ARG PYTHON_VERSION=3.10
+ARG MAMBA_VERSION=24.7.1-0
 ARG TARGETPLATFORM
-
 ENV PATH=/opt/conda/bin:$PATH \
     CONDA_PREFIX=/opt/conda

@@ -21,7 +19,7 @@ RUN case ${TARGETPLATFORM} in \
     "linux/arm64") MAMBA_ARCH=aarch64 ;; \
     *) MAMBA_ARCH=x86_64 ;; \
     esac && \
-    curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
+    curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
     bash ~/mambaforge.sh -b -p /opt/conda && \
     rm ~/mambaforge.sh

@@ -36,11 +34,14 @@ RUN case ${TARGETPLATFORM} in \
 WORKDIR /root

 COPY ./requirements.txt /lightllm/requirements.txt
-RUN pip install -r /lightllm/requirements.txt --no-cache-dir --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124
+RUN pip install -U pip
+RUN pip install -r /lightllm/requirements.txt --no-cache-dir
+
+RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly

-RUN pip install --no-cache-dir https://github.com/ModelTC/flash-attn-3-build/releases/download/v2.7.4.post1/flash_attn-3.0.0b1-cp39-cp39-linux_x86_64.whl
+RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .

-RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100
+RUN apt-get update && apt-get install -y libnuma-dev # for sgl_kernel

 COPY . /lightllm
 RUN pip install -e /lightllm --no-cache-dir

diff --git a/docker/Dockerfile.deepep b/docker/Dockerfile.deepep
index fef3b757a..058181e63 100644
--- a/docker/Dockerfile.deepep
+++ b/docker/Dockerfile.deepep
@@ -1,10 +1,8 @@
-FROM nvcr.io/nvidia/tritonserver:24.04-py3-min as base
-ARG PYTORCH_VERSION=2.6.0
-ARG PYTHON_VERSION=3.9
-ARG CUDA_VERSION=12.4
-ARG MAMBA_VERSION=23.1.0-1
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
+ARG PYTHON_VERSION=3.10
+ARG MAMBA_VERSION=24.7.1-0
 ARG TARGETPLATFORM
-
 ENV PATH=/opt/conda/bin:$PATH \
     CONDA_PREFIX=/opt/conda

@@ -21,7 +19,7 @@ RUN case ${TARGETPLATFORM} in \
     "linux/arm64") MAMBA_ARCH=aarch64 ;; \
     *) MAMBA_ARCH=x86_64 ;; \
     esac && \
-    curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
+    curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
     bash ~/mambaforge.sh -b -p /opt/conda && \
     rm ~/mambaforge.sh

@@ -36,29 +34,34 @@ RUN case ${TARGETPLATFORM} in \
 WORKDIR /root

 COPY ./requirements.txt /lightllm/requirements.txt
-RUN pip install -r /lightllm/requirements.txt --no-cache-dir --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124
+RUN pip install -U pip
+RUN pip install -r /lightllm/requirements.txt --no-cache-dir

-RUN pip install --no-cache-dir https://github.com/ModelTC/flash-attn-3-build/releases/download/v2.7.4.post1/flash_attn-3.0.0b1-cp39-cp39-linux_x86_64.whl
+RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly

-RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100
+RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .
-RUN git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git
-RUN cd DeepGEMM && python setup.py install
+RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms
+RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev

-WORKDIR /root
-RUN git clone https://github.com/deepseek-ai/DeepEP.git
+ENV CUDA_HOME=/usr/local/cuda \
+    GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

-# NVSHMEM
-RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
-RUN tar -xf nvshmem_src_3.2.5-1.txz \
-    && mv nvshmem_src nvshmem
+RUN mkdir -p /tmp/gdrcopy && cd /tmp \
+    && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
+    && cd gdrcopy/packages \
+    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
+    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
+    && cd / && rm -rf /tmp/gdrcopy

-WORKDIR /root/nvshmem
-RUN git apply /root/DeepEP/third-party/nvshmem.patch
+# Fix DeepEP IBGDA symlink
+RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so

-WORKDIR /root/nvshmem
-ENV CUDA_HOME=/usr/local/cuda
-RUN NVSHMEM_SHMEM_SUPPORT=0 \
+RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
+    && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
+    && cd nvshmem \
+    && rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
+    && NVSHMEM_SHMEM_SUPPORT=0 \
     NVSHMEM_UCX_SUPPORT=0 \
     NVSHMEM_USE_NCCL=0 \
     NVSHMEM_MPI_SUPPORT=0 \
@@ -66,9 +69,11 @@ RUN NVSHMEM_SHMEM_SUPPORT=0 \
     NVSHMEM_PMIX_SUPPORT=0 \
     NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
     NVSHMEM_USE_GDRCOPY=1 \
-    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 -DMLX5_lib=/usr/lib/x86_64-linux-gnu/libmlx5.so.1 \
-    && cd build \
-    && make install -j64
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \
+    && cmake --build build --target install -j64
+
+ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58
+RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..
 WORKDIR /root/DeepEP
 ENV NVSHMEM_DIR=/root/nvshmem/install

diff --git a/lightllm/models/qwen2_vl/vision_process.py b/lightllm/models/qwen2_vl/vision_process.py
index 45c250378..9366ca747 100644
--- a/lightllm/models/qwen2_vl/vision_process.py
+++ b/lightllm/models/qwen2_vl/vision_process.py
@@ -44,7 +44,6 @@
     ChannelDimension,
     ImageInput,
     PILImageResampling,
-    VideoInput,
     get_image_size,
     infer_channel_dimension_format,
     is_scaled_image,
@@ -54,6 +53,7 @@
     valid_images,
     validate_preprocess_arguments,
 )
+from transformers.video_utils import VideoInput
 from transformers.utils import TensorType, is_vision_available, logging

 logger = logging.get_logger(__name__)
diff --git a/lightllm/models/vit/triton_kernel/flashattention_nopad.py b/lightllm/models/vit/triton_kernel/flashattention_nopad.py
index 34e7ed6be..ab3770a36 100644
--- a/lightllm/models/vit/triton_kernel/flashattention_nopad.py
+++ b/lightllm/models/vit/triton_kernel/flashattention_nopad.py
@@ -152,7 +152,7 @@ def _flash_attention_triton_fwd(

 _flash_attn_v3_available = False
 try:
-    from flash_attn_interface import _flash_attn_forward
+    from sgl_kernel.flash_attn import flash_attn_varlen_func

     _flash_attn_v3_available = True

@@ -166,36 +166,43 @@ def flash_attention_v3_fwd(
 ):
     head_dim = q.shape[-1]
     softmax_scale = head_dim ** -0.5
-    _flash_attn_forward(
+    window_size = (-1, -1)
+    torch.ops.sgl_kernel.fwd.default(
         q,
         k,
         v,
-        None,
-        None,  # k_new, v_new
+        None,  # k_new
+        None,  # v_new
+        None,  # qv
         o,  # out
         cu_seqlens,
         cu_seqlens,
-        None,  # cu_seqlens_q/k/k_new
-        None,
-        None,  # seqused_q/k
-        max_seqlen,
-        max_seqlen,  # max_seqlen_q/k
-        None,
+        None,  # cu_seqlens_k_new
         None,
-        None,  # page_table, kv_batch_idx, leftpad_k,
         None,
-        None,  # rotary_cos/sin
+        max_seqlen,
+        max_seqlen,
+        None,  # page_table,
+        None,  # kv_batch_idx
+        None,  # leftpad_k
+        None,  # rotary cos
+        None,  # rotary sin
+        None,  # seqlens_rotary
         None,
         None,
         None,
         softmax_scale,
-        False,  # causal
-        window_size=(-1, -1),
-        softcap=0.0,
+        False,
+        window_size[0],
+        window_size[1],
+        0.0,
+        is_rotary_interleaved=False,
+        scheduler_metadata=None,
         num_splits=1,
         pack_gqa=None,
         sm_margin=0,
     )
+    return


 except ImportError:
@@ -205,10 +212,10 @@ def flash_attention_v3_fwd(

 def flash_attention_fwd(q, k, v, o, cu_seqlens, max_seqlen):
     """
-    统一的 Flash Attention 接口。如果 _flash_attn_forward 存在,
-    则使用 flash_attention_v3_fwd,否则使用 Triton 版本。
+    Unified Flash Attention interface. If sgl_kernel is available, use the
+    sgl_kernel implementation; otherwise fall back to the Triton version.
     """
-    if _flash_attn_v3_available and is_hopper():
+    if _flash_attn_v3_available and is_hopper() and False:
         flash_attention_v3_fwd(q, k, v, o, cu_seqlens, max_seqlen)
     else:
         _flash_attention_triton_fwd(q, k, v, o, cu_seqlens, max_seqlen)
diff --git a/requirements.txt b/requirements.txt
index 1febb64f1..6287a5d01 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,7 +34,7 @@ multiprocessing-logging==0.3.4
 networkx==3.1
 ninja==1.11.1
 numpy==1.25.1
-packaging==23.1
+packaging==24.2
 pip==23.0.1
 pluggy==1.2.0
 plumbum==1.8.2
@@ -54,18 +54,15 @@ ruamel.yaml==0.17.32
 ruamel.yaml.clib==0.2.7
 s3transfer==0.6.1
 sentencepiece==0.2.0
-setuptools==65.6.3
+setuptools==77.0.3
 six==1.16.0
 sniffio==1.3.0
-sympy==1.13.1
 sortedcontainers==2.4.0
 toolz==0.12.0
-torch==2.6.0
-torchvision==0.21.0
+torch==2.7.1
 tqdm==4.65.0
 transformers==4.51.2
 tokenizers==0.21.1
-triton==3.2.0
 urllib3==1.26.16
 uvicorn==0.19.0
 uvloop==0.17.0
@@ -83,9 +80,8 @@ frozendict==2.4.6
 atomics==1.0.3
 easydict==1.13
 gunicorn==23.0.0
-vllm==0.8.5
 flashinfer-python==0.2.4
-sgl-kernel==0.1.4
+sgl-kernel==0.2.6
 httpx==0.28.1
 librosa==0.11.0
 cuda_bindings==12.9.0
\ No newline at end of file