From 9aafcdba521ec2bcf379dae599c03443e4e67b0a Mon Sep 17 00:00:00 2001
From: Christopher Kodama
Date: Thu, 27 Mar 2025 08:45:20 -0500
Subject: [PATCH 1/2] Revert "Build VLLM CUDA from RHEL AI wheels, add audio and video packages (#85)"

This reverts commit 7a8e29a0e88967f069df62d4ac604ee6172bfb2e.
---
Reviewer notes (below the `---` marker, so `git am` keeps them out of the commit):
- This copy was re-flowed from a version whose newlines and indentation had
  been collapsed. Hunk line counts were checked against the `@@` headers and
  the diffstat, but leading whitespace is reconstructed; run
  `git apply --check` (add `--ignore-whitespace` if context fails to match)
  before relying on it.
- The author address is absent from the `From:` header in that copy; restore
  it before using `git am`.
- NOTE(review): in the restored `build` stage, `CXXFLAGS="$CFLAGS $CXXFLAGS"`
  is expanded by the shell before `env` assigns CFLAGS, so `-march=haswell`
  only reaches CXXFLAGS if CFLAGS is already set in the build environment.
  Confirm whether that is intended.
- NOTE(review): the revert brings back lowercase `as` on several `FROM` lines
  while other stages use `AS`; BuildKit's FromAsCasing check warns on the
  mismatch.

 .tekton/vllm-cuda-v2-19-push.yaml |   6 --
 Dockerfile.ubi                    | 138 ++++++++++++++++++++----------
 argfile.konflux                   |   7 --
 payload/run.sh                    |  34 --------
 4 files changed, 91 insertions(+), 94 deletions(-)
 delete mode 100644 argfile.konflux
 delete mode 100755 payload/run.sh

diff --git a/.tekton/vllm-cuda-v2-19-push.yaml b/.tekton/vllm-cuda-v2-19-push.yaml
index 527dee19f3b3..348585d50bdc 100644
--- a/.tekton/vllm-cuda-v2-19-push.yaml
+++ b/.tekton/vllm-cuda-v2-19-push.yaml
@@ -30,10 +30,6 @@ spec:
     value: Dockerfile.ubi
   - name: path-context
     value: .
-  - name: additional-build-secret
-    value: rhel-ai-private-index-auth
-  - name: build-args-file
-    value: argfile.konflux
   taskRunSpecs:
   - pipelineTaskName: ecosystem-cert-preflight-checks
     computeResources:
@@ -298,8 +294,6 @@ spec:
           - $(params.build-platforms)
       name: build-images
       params:
-      - name: ADDITIONAL_SECRET
-        value: $(params.additional-build-secret)
       - name: IMAGE
         value: $(params.output-image)
       - name: DOCKERFILE
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index e84473d21e0e..82c438a63e4f 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -1,9 +1,12 @@
+## Global Args #################################################################
+ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
+ARG PYTHON_VERSION=3.12
 
-ARG BASE_UBI_IMAGE_TAG
-ARG PYTHON_VERSION
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
 
 ## Base Layer ##################################################################
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
 ARG PYTHON_VERSION
 ENV PYTHON_VERSION=${PYTHON_VERSION}
 RUN microdnf -y update && microdnf install -y --nodocs \
@@ -16,14 +19,13 @@ ENV LANG=C.UTF-8 \
     LC_ALL=C.UTF-8
 
 # Some utils for dev purposes - tar required for kubectl cp
-
 RUN microdnf install -y --nodocs \
-    which procps findutils tar vim git \
+    which procps findutils tar vim git\
     && microdnf clean all
 
 
 ## Python Installer ############################################################
-FROM base AS python-install
+FROM base as python-install
 ARG PYTHON_VERSION
 
 ENV VIRTUAL_ENV=/opt/vllm
@@ -31,13 +33,11 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 ENV PYTHON_VERSION=${PYTHON_VERSION}
 RUN microdnf install -y --nodocs \
     python${PYTHON_VERSION}-devel && \
-    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
-    pip install --no-cache -U pip wheel uv && \
-    microdnf clean all
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
 
 
 ## CUDA Base ###################################################################
-FROM python-install AS cuda-base
+FROM python-install as cuda-base
 
 RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
         https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,6 +51,7 @@ RUN microdnf install -y --nodocs \
     ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/
 
 
+
 ## Python cuda base #################################################################
 FROM cuda-base AS python-cuda-base
 
@@ -58,23 +59,80 @@ ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
 # install cuda and common dependencies
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
     uv pip install \
         -r requirements-cuda.txt
 
 
+## Development #################################################################
+FROM python-cuda-base AS dev
+
+# install build and runtime dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
+    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
+    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
+    uv pip install \
+        -r requirements-cuda.txt \
+        -r requirements-dev.txt
+
+## Builder #####################################################################
+FROM dev AS build
+
+# install build dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
+    uv pip install -r requirements-build.txt
+
+# install compiler cache to speed up compilation leveraging local or remote caching
+# git is required for the cutlass kernels
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all
+
+COPY . .
+
+ARG TORCH_CUDA_ARCH_LIST
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+ARG vllm_fa_cmake_gpu_arches
+ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=8
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+# Make sure the cuda environment is in the PATH
+ENV PATH=/usr/local/cuda/bin:$PATH
+
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=.git,target=/workspace/.git \
+    env CFLAGS="-march=haswell" \
+        CXXFLAGS="$CFLAGS $CXXFLAGS" \
+        CMAKE_BUILD_TYPE=Release \
+        python3 setup.py bdist_wheel --dist-dir=dist
 
 #################### libsodium Build IMAGE ####################
-FROM base AS libsodium-builder
+FROM base as libsodium-builder
 
 RUN microdnf install -y --nodocs gcc gzip \
     && microdnf clean all
 
 WORKDIR /usr/src/libsodium
 
-ARG LIBSODIUM_VERSION
+ARG LIBSODIUM_VERSION=1.0.20
 RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
     && tar -xzvf libsodium*.tar.gz \
     && rm -f libsodium*.tar.gz \
@@ -98,32 +156,25 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
 ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
 
 # Triton needs a CC compiler
-
 RUN microdnf install -y --nodocs gcc \
     rsync \
     && microdnf clean all
 
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
 
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
-    make -C /usr/src/libsodium install
+    cd /usr/src/libsodium \
+    && make install
 
-COPY LICENSE /licenses/vllm.md
-COPY examples/*.jinja /app/data/template/
-
-# install vllm by running the payload script and then install flashinfer
-
-ARG VLLM_WHEEL_VERSION
-ARG VLLM_WHEEL_INDEX
-ARG FLASHINFER_VERSION
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=payload,target=/workspace/payload \
-    --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
-    env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
-        VLLM_WHEEL_VERSION=${VLLM_VERSION} \
-        VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
-        ./payload/run.sh && \
-    uv pip install "${FLASHINFER_VERSION}"
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install \
+    "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"
 
 ENV HF_HUB_OFFLINE=1 \
     HOME=/home/vllm \
@@ -148,7 +199,10 @@ ENV HF_HUB_OFFLINE=1 \
 RUN umask 002 && \
     useradd --uid 2000 --gid 0 vllm && \
     mkdir -p /home/vllm && \
-    chmod g+rwx /home/vllm
+    chmod g+rwx /home/vllm /usr/src /workspace
+
+COPY LICENSE /licenses/vllm.md
+COPY examples/*.jinja /app/data/template/
 
 USER 2000
 WORKDIR /home/vllm
@@ -156,24 +210,14 @@ WORKDIR /home/vllm
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 
 
-## TGIS Adapter layer #####################################################################
-FROM vllm-openai AS vllm-grpc-adapter
+FROM vllm-openai as vllm-grpc-adapter
 
 USER root
 
-ARG VLLM_TGIS_ADAPTER_VERSION
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=payload,target=/workspace/payload \
-    --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
-    cd /workspace && \
-    ls && \
-    env HOME=/root \
-        BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
-        VLLM_WHEEL_VERSION=${VLLM_VERSION} \
-        VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
-        VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
-        ./payload/run.sh
-
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \
diff --git a/argfile.konflux b/argfile.konflux
deleted file mode 100644
index 3d24e5066ff7..000000000000
--- a/argfile.konflux
+++ /dev/null
@@ -1,7 +0,0 @@
-BASE_UBI_IMAGE_TAG=9.5-1739420147
-PYTHON_VERSION=3.11
-LIBSODIUM_VERSION=1.0.20
-VLLM_TGIS_ADAPTER_VERSION=0.6.3
-FLASHINFER_VERSION=https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
-VLLM_WHEEL_VERSION=0.7.2
-VLLM_WHEEL_INDEX=https://gitlab.com/api/v4/projects/66664052/packages/pypi/simple
diff --git a/payload/run.sh b/payload/run.sh
deleted file mode 100755
index 7d0c62fe3e16..000000000000
--- a/payload/run.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-# required env vars:
-# $BOT_PAT
-# $WHEEL_RELEASE_ARTIFACTS
-# optional:
-# $VLLM_TGIS_ADAPTER_VERSION
-# $VLLM_WHEEL_VERSION
-set -ex
-
-cat <<EOF > ${HOME}/.netrc
-machine gitlab.com
-login rhel-ai-wheels-prefetch-token-rhoai
-password $BOT_PAT
-EOF
-
-trap "rm ${HOME}/.netrc" EXIT
-
-# https://docs.astral.sh/uv/configuration/indexes/#searching-across-multiple-indexes
-# This will prefer to use the custom index, and fall back to pypi if needed
-export UV_EXTRA_INDEX_URL=${VLLM_WHEEL_INDEX}
-export UV_INDEX_STRATEGY=unsafe-first-match
-
-vllm="vllm[tensorizer,audio,video]"
-
-if [[ -n "$VLLM_TGIS_ADAPTER_VERSION" ]]; then
-  vllm_tgis_adapter="vllm-tgis-adapter==${VLLM_TGIS_ADAPTER_VERSION}"
-fi
-
-if [[ -n "$VLLM_WHEEL_VERSION" ]]; then
-  vllm="${vllm}==${$VLLM_WHEEL_VERSION}"
-fi
-
-uv pip install $vllm $vllm_tgis_adapter
-

From 7597604806090ec31133da77628eb247e9046cdc Mon Sep 17 00:00:00 2001
From: Christopher Kodama
Date: Thu, 27 Mar 2025 08:51:20 -0500
Subject: [PATCH 2/2] [RHOAIRFE-532] add audio/video support to CUDA

---
 Dockerfile.ubi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 82c438a63e4f..0f46ed0cee96 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -164,7 +164,7 @@ RUN microdnf install -y --nodocs gcc \
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
+    uv pip install "$(echo dist/*.whl)[audio,video,tensorizer]" --verbose
 
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
@@ -217,7 +217,7 @@ USER root
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3
+    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[audio,video,tensorizer]" vllm-tgis-adapter==0.6.3
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \