From 9aafcdba521ec2bcf379dae599c03443e4e67b0a Mon Sep 17 00:00:00 2001
From: Christopher Kodama <ckodama@redhat.com>
Date: Thu, 27 Mar 2025 08:45:20 -0500
Subject: [PATCH 1/2] Revert "Build VLLM CUDA from RHEL AI wheels, add audio
 and video packages (#85)"

This reverts commit 7a8e29a0e88967f069df62d4ac604ee6172bfb2e.
---
 .tekton/vllm-cuda-v2-19-push.yaml |   6 --
 Dockerfile.ubi                    | 138 ++++++++++++++++++++----------
 argfile.konflux                   |   7 --
 payload/run.sh                    |  34 --------
 4 files changed, 91 insertions(+), 94 deletions(-)
 delete mode 100644 argfile.konflux
 delete mode 100755 payload/run.sh

diff --git a/.tekton/vllm-cuda-v2-19-push.yaml b/.tekton/vllm-cuda-v2-19-push.yaml
index 527dee19f3b3..348585d50bdc 100644
--- a/.tekton/vllm-cuda-v2-19-push.yaml
+++ b/.tekton/vllm-cuda-v2-19-push.yaml
@@ -30,10 +30,6 @@ spec:
     value: Dockerfile.ubi
   - name: path-context
     value: .
-  - name: additional-build-secret
-    value: rhel-ai-private-index-auth
-  - name: build-args-file
-    value: argfile.konflux
   taskRunSpecs:
     - pipelineTaskName: ecosystem-cert-preflight-checks
       computeResources:
@@ -298,8 +294,6 @@ spec:
           - $(params.build-platforms)
       name: build-images
       params:
-      - name: ADDITIONAL_SECRET
-        value: $(params.additional-build-secret)
       - name: IMAGE
         value: $(params.output-image)
       - name: DOCKERFILE
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index e84473d21e0e..82c438a63e4f 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -1,9 +1,12 @@
+## Global Args #################################################################
+ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
+ARG PYTHON_VERSION=3.12
 
-ARG BASE_UBI_IMAGE_TAG
-ARG PYTHON_VERSION
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
 
 ## Base Layer ##################################################################
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
 ARG PYTHON_VERSION
 ENV PYTHON_VERSION=${PYTHON_VERSION}
 RUN microdnf -y update && microdnf install -y --nodocs \
@@ -16,14 +19,13 @@ ENV LANG=C.UTF-8 \
     LC_ALL=C.UTF-8
 
 # Some utils for dev purposes - tar required for kubectl cp
-
 RUN microdnf install -y --nodocs \
         which procps findutils tar vim git \
     && microdnf clean all
 
 
 ## Python Installer ############################################################
 FROM base AS python-install
 ARG PYTHON_VERSION
 
 ENV VIRTUAL_ENV=/opt/vllm
@@ -31,13 +33,11 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 ENV PYTHON_VERSION=${PYTHON_VERSION}
 RUN microdnf install -y --nodocs \
     python${PYTHON_VERSION}-devel  && \
-    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
-    pip install --no-cache -U pip wheel uv && \
-    microdnf clean all
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
 
 
 ## CUDA Base ###################################################################
 FROM python-install AS cuda-base
 
 RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
         https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,6 +51,7 @@ RUN microdnf install -y --nodocs \
     ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/
 
 
+
 ## Python cuda base #################################################################
 FROM cuda-base AS python-cuda-base
 
@@ -58,23 +59,80 @@ ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
 # install cuda and common dependencies
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
     uv pip install \
         -r requirements-cuda.txt
 
 
+## Development #################################################################
+FROM python-cuda-base AS dev
+
+# install the cuda + dev requirement sets (lint/test files are bind-mounted only, not passed to uv directly)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
+    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
+    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
+    uv pip install \
+        -r requirements-cuda.txt \
+        -r requirements-dev.txt
+
+## Builder #####################################################################
+FROM dev AS build
+
+# install build dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
+    uv pip install -r requirements-build.txt
+
+# install compiler cache to speed up compilation leveraging local or remote caching
+# git is required for the cutlass kernels
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all
+
+COPY . .
+
+ARG TORCH_CUDA_ARCH_LIST
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+ARG vllm_fa_cmake_gpu_arches
+ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=8
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+# Make sure the cuda environment is in the PATH
+ENV PATH=/usr/local/cuda/bin:$PATH
+
+# "$CFLAGS" is expanded by the shell before `env` assigns it, so repeat the flag for C++.
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=.git,target=/workspace/.git \
+    env CFLAGS="-march=haswell" CXXFLAGS="-march=haswell $CXXFLAGS" \
+        CMAKE_BUILD_TYPE=Release \
+        python3 setup.py bdist_wheel --dist-dir=dist
 
 #################### libsodium Build IMAGE ####################
 FROM base AS libsodium-builder
 
 RUN microdnf install -y --nodocs gcc gzip \
     && microdnf clean all
 
 WORKDIR /usr/src/libsodium
 
-ARG LIBSODIUM_VERSION
+ARG LIBSODIUM_VERSION=1.0.20
 RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
     && tar -xzvf libsodium*.tar.gz \
     && rm -f libsodium*.tar.gz \
@@ -98,32 +156,25 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
 ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
 
 # Triton needs a CC compiler
-
 RUN microdnf install -y --nodocs gcc \
     rsync \
     && microdnf clean all
 
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
 
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
-    make -C /usr/src/libsodium install
+    cd /usr/src/libsodium \
+    && make install
 
-COPY LICENSE /licenses/vllm.md
-COPY examples/*.jinja /app/data/template/
-
-# install vllm by running the payload script and then install flashinfer
-
-ARG VLLM_WHEEL_VERSION
-ARG VLLM_WHEEL_INDEX
-ARG FLASHINFER_VERSION
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=payload,target=/workspace/payload \
-    --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
-        env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
-            VLLM_WHEEL_VERSION=${VLLM_VERSION} \
-            VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
-        ./payload/run.sh && \
-        uv pip install "${FLASHINFER_VERSION}" 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install \
+        "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"
 
 ENV HF_HUB_OFFLINE=1 \
     HOME=/home/vllm \
@@ -148,7 +199,10 @@ ENV HF_HUB_OFFLINE=1 \
 RUN umask 002 && \
     useradd --uid 2000 --gid 0 vllm && \
     mkdir -p /home/vllm && \
-    chmod g+rwx /home/vllm
+    chmod g+rwx /home/vllm /usr/src /workspace
+
+COPY LICENSE /licenses/vllm.md
+COPY examples/*.jinja /app/data/template/
 
 USER 2000
 WORKDIR /home/vllm
@@ -156,24 +210,14 @@ WORKDIR /home/vllm
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 
 
-## TGIS Adapter layer #####################################################################
 FROM vllm-openai AS vllm-grpc-adapter
 
 USER root
 
-ARG VLLM_TGIS_ADAPTER_VERSION
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=payload,target=/workspace/payload \
-    --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
-    cd /workspace && \
-    ls && \
-    env HOME=/root \
-        BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
-        VLLM_WHEEL_VERSION=${VLLM_VERSION} \
-        VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
-        VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
-        ./payload/run.sh
-
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \
diff --git a/argfile.konflux b/argfile.konflux
deleted file mode 100644
index 3d24e5066ff7..000000000000
--- a/argfile.konflux
+++ /dev/null
@@ -1,7 +0,0 @@
-BASE_UBI_IMAGE_TAG=9.5-1739420147
-PYTHON_VERSION=3.11
-LIBSODIUM_VERSION=1.0.20
-VLLM_TGIS_ADAPTER_VERSION=0.6.3
-FLASHINFER_VERSION=https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
-VLLM_WHEEL_VERSION=0.7.2
-VLLM_WHEEL_INDEX=https://gitlab.com/api/v4/projects/66664052/packages/pypi/simple
diff --git a/payload/run.sh b/payload/run.sh
deleted file mode 100755
index 7d0c62fe3e16..000000000000
--- a/payload/run.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-# required env vars:
-# $BOT_PAT
-# $WHEEL_RELEASE_ARTIFACTS
-# optional:
-# $VLLM_TGIS_ADAPTER_VERSION
-# $VLLM_WHEEL_VERSION
-set -ex
-
-cat <<EOF > ${HOME}/.netrc
-machine gitlab.com
-login rhel-ai-wheels-prefetch-token-rhoai 
-password $BOT_PAT
-EOF
-
-trap "rm ${HOME}/.netrc" EXIT
-
-# https://docs.astral.sh/uv/configuration/indexes/#searching-across-multiple-indexes
-# This will prefer to use the custom index, and fall back to pypi if needed
-export UV_EXTRA_INDEX_URL=${VLLM_WHEEL_INDEX}
-export UV_INDEX_STRATEGY=unsafe-first-match 
-
-vllm="vllm[tensorizer,audio,video]"
-
-if [[ -n "$VLLM_TGIS_ADAPTER_VERSION" ]]; then
-    vllm_tgis_adapter="vllm-tgis-adapter==${VLLM_TGIS_ADAPTER_VERSION}"
-fi
-
-if [[ -n "$VLLM_WHEEL_VERSION" ]]; then
-    vllm="${vllm}==${$VLLM_WHEEL_VERSION}"
-fi
-
-uv pip install $vllm $vllm_tgis_adapter
-

From 7597604806090ec31133da77628eb247e9046cdc Mon Sep 17 00:00:00 2001
From: Christopher Kodama <ckodama@redhat.com>
Date: Thu, 27 Mar 2025 08:51:20 -0500
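Subject: [PATCH 2/2] [RHOAIRFE-532] add audio/video support to CUDA

Install the locally built vLLM wheel with the `audio` and `video` extras,
in addition to `tensorizer`, in both `uv pip install` invocations in
Dockerfile.ubi that install the wheel.
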
---
 Dockerfile.ubi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 82c438a63e4f..0f46ed0cee96 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -164,7 +164,7 @@ RUN microdnf install -y --nodocs gcc \
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
+    uv pip install "$(echo dist/*.whl)[audio,video,tensorizer]" --verbose
 
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
@@ -217,7 +217,7 @@ USER root
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3
+    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[audio,video,tensorizer]" vllm-tgis-adapter==0.6.3
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \