6 changes: 0 additions & 6 deletions .tekton/vllm-cuda-v2-19-push.yaml
@@ -30,10 +30,6 @@ spec:
value: Dockerfile.ubi
- name: path-context
value: .
- name: additional-build-secret
value: rhel-ai-private-index-auth
- name: build-args-file
value: argfile.konflux
taskRunSpecs:
- pipelineTaskName: ecosystem-cert-preflight-checks
computeResources:
@@ -298,8 +294,6 @@ spec:
- $(params.build-platforms)
name: build-images
params:
- name: ADDITIONAL_SECRET
value: $(params.additional-build-secret)
- name: IMAGE
value: $(params.output-image)
- name: DOCKERFILE
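Note on the removals above: the additional-build-secret pipeline parameter was wired into the build task as ADDITIONAL_SECRET, and surfaced inside the container build as a BuildKit secret (the old Dockerfile.ubi read /run/secrets/rhel-ai-private-index-auth/BOT_PAT, as seen further down in this diff). A rough local equivalent of that wiring, sketched with an illustrative token file path and assuming the builder accepts the slash-separated secret id:

    # Hypothetical local equivalent of the removed secret wiring; the token file
    # path is illustrative, the secret id is copied from the old Dockerfile.
    printf '%s' "$BOT_PAT" > /tmp/bot_pat.txt
    docker build -f Dockerfile.ubi \
      --secret id=rhel-ai-private-index-auth/BOT_PAT,src=/tmp/bot_pat.txt \
      --target vllm-openai .

With the parameter gone, no secret reaches the build, which is consistent with the payload-based install steps being dropped from Dockerfile.ubi below.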
138 changes: 91 additions & 47 deletions Dockerfile.ubi
@@ -1,9 +1,12 @@
## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
ARG PYTHON_VERSION=3.12

ARG BASE_UBI_IMAGE_TAG
ARG PYTHON_VERSION
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf -y update && microdnf install -y --nodocs \
@@ -16,28 +19,25 @@ ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp

RUN microdnf install -y --nodocs \
which procps findutils tar vim git \
which procps findutils tar vim git\
&& microdnf clean all


## Python Installer ############################################################
FROM base AS python-install
FROM base as python-install
ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf install -y --nodocs \
python${PYTHON_VERSION}-devel && \
python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
pip install --no-cache -U pip wheel uv && \
microdnf clean all
python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all


## CUDA Base ###################################################################
FROM python-install AS cuda-base
FROM python-install as cuda-base

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,30 +51,88 @@ RUN microdnf install -y --nodocs \
ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/



## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# install cuda and common dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
uv pip install \
-r requirements-cuda.txt


## Development #################################################################
FROM python-cuda-base AS dev

# install build and runtime dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
--mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
--mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
--mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
uv pip install \
-r requirements-cuda.txt \
-r requirements-dev.txt

## Builder #####################################################################
FROM dev AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
uv pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
# git is required for the cutlass kernels
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all

COPY . .

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
ARG vllm_fa_cmake_gpu_arches
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Make sure the cuda environment is in the PATH
ENV PATH=/usr/local/cuda/bin:$PATH

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=.git,target=/workspace/.git \
env CFLAGS="-march=haswell" \
CXXFLAGS="$CFLAGS $CXXFLAGS" \
CMAKE_BUILD_TYPE=Release \
python3 setup.py bdist_wheel --dist-dir=dist

#################### libsodium Build IMAGE ####################
FROM base AS libsodium-builder
FROM base as libsodium-builder

RUN microdnf install -y --nodocs gcc gzip \
&& microdnf clean all

WORKDIR /usr/src/libsodium

ARG LIBSODIUM_VERSION
ARG LIBSODIUM_VERSION=1.0.20
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
&& tar -xzvf libsodium*.tar.gz \
&& rm -f libsodium*.tar.gz \
@@ -98,32 +156,25 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"

# Triton needs a CC compiler

RUN microdnf install -y --nodocs gcc \
rsync \
&& microdnf clean all

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install "$(echo dist/*.whl)[audio,video,tensorizer]" --verbose

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
make -C /usr/src/libsodium install
cd /usr/src/libsodium \
&& make install

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

# install vllm by running the payload script and then install flashinfer

ARG VLLM_WHEEL_VERSION
ARG VLLM_WHEEL_INDEX
ARG FLASHINFER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=payload,target=/workspace/payload \
--mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
VLLM_WHEEL_VERSION=${VLLM_VERSION} \
VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
./payload/run.sh && \
uv pip install "${FLASHINFER_VERSION}"
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install \
"https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"

ENV HF_HUB_OFFLINE=1 \
HOME=/home/vllm \
@@ -148,32 +199,25 @@ ENV HF_HUB_OFFLINE=1 \
RUN umask 002 && \
useradd --uid 2000 --gid 0 vllm && \
mkdir -p /home/vllm && \
chmod g+rwx /home/vllm
chmod g+rwx /home/vllm /usr/src /workspace

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

USER 2000
WORKDIR /home/vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


## TGIS Adapter layer #####################################################################
FROM vllm-openai AS vllm-grpc-adapter
FROM vllm-openai as vllm-grpc-adapter

USER root

ARG VLLM_TGIS_ADAPTER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=payload,target=/workspace/payload \
--mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
cd /workspace && \
ls && \
env HOME=/root \
BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
VLLM_WHEEL_VERSION=${VLLM_VERSION} \
VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
./payload/run.sh

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[audio,video,tensorizer]" vllm-tgis-adapter==0.6.3

ENV GRPC_PORT=8033 \
PORT=8000 \
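Since the build-args-file parameter and argfile.konflux are also removed (below), the bare ARGs at the top of the new Dockerfile.ubi no longer receive values from a checked-in args file, so a local build has to pass them explicitly. A sketch: the tag and Python version mirror values visible elsewhere in this diff, and the remaining values are illustrative, not the pipeline's authoritative settings.

    # Illustrative local build of the new Dockerfile.ubi.
    docker build -f Dockerfile.ubi \
      --build-arg BASE_UBI_IMAGE_TAG=9.5-1741850109 \
      --build-arg PYTHON_VERSION=3.12 \
      --build-arg max_jobs=8 \
      --build-arg nvcc_threads=4 \
      --target vllm-openai \
      -t vllm-cuda:dev .

The vllm-grpc-adapter image can be built the same way by swapping the --target value.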
7 changes: 0 additions & 7 deletions argfile.konflux

This file was deleted.

34 changes: 0 additions & 34 deletions payload/run.sh

This file was deleted.
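The deleted payload/run.sh contents are not shown in this diff. From the way the old Dockerfile.ubi invoked it (environment variables BOT_PAT, VLLM_WHEEL_VERSION, VLLM_WHEEL_INDEX, and, in the adapter stage, VLLM_TGIS_ADAPTER_VERSION), its shape was plausibly along these lines; this is a hypothetical reconstruction of the pattern, not the deleted script:

    #!/usr/bin/env bash
    # Hypothetical sketch: reconstructs the pattern implied by the old
    # Dockerfile's invocation, not the deleted file's actual contents.
    set -euo pipefail
    : "${BOT_PAT:?private index token required}"
    : "${VLLM_WHEEL_INDEX:?wheel index URL required}"
    : "${VLLM_WHEEL_VERSION:?wheel version required}"

    # Install the pinned vllm wheel from the authenticated private index.
    uv pip install "vllm==${VLLM_WHEEL_VERSION}" \
      --extra-index-url "https://bot:${BOT_PAT}@${VLLM_WHEEL_INDEX#https://}"

    # The adapter stage also passed VLLM_TGIS_ADAPTER_VERSION, so the script
    # presumably installed the adapter when that variable was set.
    if [[ -n "${VLLM_TGIS_ADAPTER_VERSION:-}" ]]; then
      uv pip install "vllm-tgis-adapter==${VLLM_TGIS_ADAPTER_VERSION}"
    fi

The new Dockerfile replaces this flow entirely: the vllm-openai image installs the wheel produced by the restored build stage plus a pinned flashinfer wheel, and the vllm-grpc-adapter image installs vllm-tgis-adapter==0.6.3 from the public index.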