Build VLLM CUDA from AIPCC wheels #86
@@ -2,11 +2,8 @@
 ARG BASE_UBI_IMAGE_TAG=9.5-1742914212
 ARG PYTHON_VERSION=3.12
 
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
-
 ## Base Layer ##################################################################
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
 ARG PYTHON_VERSION
 ENV PYTHON_VERSION=${PYTHON_VERSION}
 RUN microdnf -y update && microdnf install -y --nodocs \
@@ -19,25 +16,27 @@ ENV LANG=C.UTF-8 \
     LC_ALL=C.UTF-8
 
 # Some utils for dev purposes - tar required for kubectl cp
 
 RUN microdnf install -y --nodocs \
-        which procps findutils tar vim git\
+        which procps findutils tar vim git \
         && microdnf clean all
 
 
 ## Python Installer ############################################################
-FROM base as python-install
+FROM base AS python-install
 ARG PYTHON_VERSION
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 ENV PYTHON_VERSION=${PYTHON_VERSION}
 RUN microdnf install -y --nodocs \
     python${PYTHON_VERSION}-devel && \
-    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
+    microdnf clean all
 
 
 ## CUDA Base ###################################################################
-FROM python-install as cuda-base
+FROM python-install AS cuda-base
Review comment: Does this cuda-base layer include all of the rpm dependencies required by the wheel set? Should be enough to check with …

Reply: If I was able to get a green build, does that mean that all the required dependencies are there? Or do I need to do additional checks?
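One rough way to answer that question (an editorial sketch, not necessarily the check the reviewer had in mind; it assumes the venv lives at /opt/vllm as in this Dockerfile):

# Spot-check: scan the installed wheels' native extensions for unresolved
# shared-library dependencies; any "not found" entry points at a missing RPM.
find /opt/vllm/lib/python*/site-packages -name '*.so*' \
    | xargs -r -n1 ldd 2>/dev/null | grep 'not found' | sort -u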
 RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
     https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,88 +50,30 @@ RUN microdnf install -y --nodocs \
     ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/
 
 
 ## Python cuda base #################################################################
 FROM cuda-base AS python-cuda-base
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
 # install cuda and common dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
     uv pip install \
         -r requirements-cuda.txt
 
-
-## Development #################################################################
-FROM python-cuda-base AS dev
-
-# install build and runtime dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
-    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
-    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
-    uv pip install \
-        -r requirements-cuda.txt \
-        -r requirements-dev.txt
-
-## Builder #####################################################################
-FROM dev AS build
-
-# install build dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
-    uv pip install -r requirements-build.txt
-
-# install compiler cache to speed up compilation leveraging local or remote caching
-# git is required for the cutlass kernels
-RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all
-
-COPY . .
-
-ARG TORCH_CUDA_ARCH_LIST
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-ARG vllm_fa_cmake_gpu_arches
-ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
-
-# max jobs used by Ninja to build extensions
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# number of threads used by nvcc
-ARG nvcc_threads=8
-ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-
-# Make sure the cuda environment is in the PATH
-ENV PATH=/usr/local/cuda/bin:$PATH
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=.git,target=/workspace/.git \
-    env CFLAGS="-march=haswell" \
-        CXXFLAGS="$CFLAGS $CXXFLAGS" \
-        CMAKE_BUILD_TYPE=Release \
-        python3 setup.py bdist_wheel --dist-dir=dist
-
 #################### libsodium Build IMAGE ####################
-FROM base as libsodium-builder
+FROM base AS libsodium-builder
 
 RUN microdnf install -y --nodocs gcc gzip \
     && microdnf clean all
 
 WORKDIR /usr/src/libsodium
 
-ARG LIBSODIUM_VERSION=1.0.20
+ARG LIBSODIUM_VERSION
 RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
     && tar -xzvf libsodium*.tar.gz \
     && rm -f libsodium*.tar.gz \
@@ -156,25 +97,29 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
 ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
 
 # Triton needs a CC compiler
 
 RUN microdnf install -y --nodocs gcc \
+    rsync \
     && microdnf clean all
 
-# install vllm wheel first, so that torch etc will be installed
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
-
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
-    cd /usr/src/libsodium \
-    && make install
+    make -C /usr/src/libsodium install
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install \
-        "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"
+COPY LICENSE /licenses/vllm.md
+COPY examples/*.jinja /app/data/template/
 
+# install vllm by running the payload script and then install flashinfer
+
+ARG WHEEL_RELEASE
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=payload,target=/workspace/payload \
+    --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
+    source ${VIRTUAL_ENV}/bin/activate && \
+    env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
+        WHEEL_RELEASE=${WHEEL_RELEASE} \
+        ./payload/run.sh
 
 ENV HF_HUB_OFFLINE=1 \
     HOME=/home/vllm \
@@ -199,25 +144,30 @@ ENV HF_HUB_OFFLINE=1 \
 RUN umask 002 && \
     useradd --uid 2000 --gid 0 vllm && \
     mkdir -p /home/vllm && \
-    chmod g+rwx /home/vllm /usr/src /workspace
-
-COPY LICENSE /licenses/vllm.md
-COPY examples/*.jinja /app/data/template/
+    chmod g+rwx /home/vllm
 
 USER 2000
 WORKDIR /home/vllm
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 
 
-FROM vllm-openai as vllm-grpc-adapter
+## TGIS Adapter layer #####################################################################
+FROM vllm-openai AS vllm-grpc-adapter
 
 USER root
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3
+ARG WHEEL_RELEASE
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=payload,target=/workspace/payload \
+    --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
+    cd /workspace && \
+    source ${VIRTUAL_ENV}/bin/activate && \
+    env HOME=/root \
+        BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
+        WHEEL_RELEASE=${WHEEL_RELEASE} \
+        ./payload/run.sh
 
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \
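For context on how these stages are driven: the two payload RUNs expect a BuildKit secret with id rhel-ai-private-index-auth/BOT_PAT and a WHEEL_RELEASE build arg. A hypothetical invocation follows; the repo's actual Makefile/CI wiring is not shown in this diff, so the file name and target choice here are assumptions, only the flag ids come from the Dockerfile above.

# Sketch: build the OpenAI server image, feeding the GitLab token as a secret.
# bot_pat.txt is a hypothetical local file holding the token.
podman build \
    --build-arg WHEEL_RELEASE=2.20.55+vllm-cuda-ubi9-x86_64 \
    --secret id=rhel-ai-private-index-auth/BOT_PAT,src=bot_pat.txt \
    --target vllm-openai \
    -t vllm-cuda .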
@@ -0,0 +1,5 @@
+BASE_UBI_IMAGE_TAG=9.5-1739420147
+PYTHON_VERSION=3.11
+LIBSODIUM_VERSION=1.0.20
+WHEEL_RELEASE=2.20.55+vllm-cuda-ubi9-x86_64
+# can view releases at https://gitlab.com/redhat/rhel-ai/rhoai/pipeline/-/releases
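These are plain KEY=VALUE pairs matching the ARGs in the Dockerfile above. A hedged sketch of how such a file could feed a build (the file's real name and the consuming script are not shown in this diff):

# Turn every non-comment, non-empty line into a --build-arg flag.
ARGS_FILE=build-args.conf  # hypothetical name for the file above
podman build $(grep -Ev '^#|^$' "$ARGS_FILE" | sed 's/^/--build-arg /') -t vllm-cuda .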
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Script assumes python venv is already properly configured
+# required env vars:
+#   $BOT_PAT
+#   $WHEEL_RELEASE
+#   $WHEEL_BASEURL
+set -ex
+
+cat <<EOF > ${HOME}/.netrc
+machine gitlab.com
+login rhel-ai-wheels-prefetch-token-rhoai
+password $BOT_PAT
+EOF
+
+trap "rm -rf ${HOME}/.netrc release release.tar.gz" EXIT
+
+# WHEEL_RELEASE="2.20.55+vllm-cuda-ubi9-x86_64"
+
+# Gitlab project ID, etc should be static
+WHEEL_RELEASE_ARTIFACTS="https://gitlab.com/api/v4/projects/68045055/packages/generic/rhelai-wheels/${WHEEL_RELEASE}/wheels-${WHEEL_RELEASE}.tar.gz"
+
+# NOTE - ensure that flashinfer is included in wheel thing

Review comment: Looks like it's included? https://issues.redhat.com/browse/AIPCC-49

+curl --netrc -o release.tar.gz ${WHEEL_RELEASE_ARTIFACTS}
+tar zxvf release.tar.gz
+./release/install_wheels.sh
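For reference, this mirrors how the Dockerfile above invokes the script: inside the activated venv, with the token and release name passed through the environment.

# Same invocation as the vllm-openai stage; the token value is a placeholder.
source /opt/vllm/bin/activate
BOT_PAT=<gitlab-token> WHEEL_RELEASE=2.20.55+vllm-cuda-ubi9-x86_64 ./payload/run.sh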
Review comment: could you please try with a base image built by AIPCC: http://quay.io/aipcc/base-images/cuda. If your builder doesn't have access to that image, you can ping me on Slack and I'll send you the procedure to get access.
Review comment: also for reference, this is the dockerfile for RHAIIS: https://gitlab.com/redhat/rhel-ai/rhaiis/containers/-/blob/main/Containerfile.cuda-ubi9?ref_type=heads