6 changes: 0 additions & 6 deletions .tekton/vllm-cuda-v2-19-push.yaml
@@ -30,10 +30,6 @@ spec:
value: Dockerfile.ubi
- name: path-context
value: .
- name: additional-build-secret
value: rhel-ai-private-index-auth
- name: build-args-file
value: argfile.konflux
taskRunSpecs:
- pipelineTaskName: ecosystem-cert-preflight-checks
computeResources:
@@ -298,8 +294,6 @@ spec:
- $(params.build-platforms)
name: build-images
params:
- name: ADDITIONAL_SECRET
value: $(params.additional-build-secret)
- name: IMAGE
value: $(params.output-image)
- name: DOCKERFILE
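Note on the removals above: the additional-build-secret pipeline parameter was wired into the build task as ADDITIONAL_SECRET, and surfaced inside the container build as a BuildKit secret (the old Dockerfile.ubi read /run/secrets/rhel-ai-private-index-auth/BOT_PAT, as seen further down in this diff). A rough local equivalent of that wiring, sketched with an illustrative token file path and assuming the builder accepts the slash-separated secret id:

    # Hypothetical local equivalent of the removed secret wiring; the token file
    # path is illustrative, the secret id is copied from the old Dockerfile.
    printf '%s' "$BOT_PAT" > /tmp/bot_pat.txt
    docker build -f Dockerfile.ubi \
      --secret id=rhel-ai-private-index-auth/BOT_PAT,src=/tmp/bot_pat.txt \
      --target vllm-openai .

With the parameter gone, no secret reaches the build, which is consistent with the payload-based install steps being dropped from Dockerfile.ubi below.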
138 changes: 91 additions & 47 deletions Dockerfile.ubi
@@ -1,9 +1,12 @@
## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
ARG PYTHON_VERSION=3.12

ARG BASE_UBI_IMAGE_TAG
ARG PYTHON_VERSION
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf -y update && microdnf install -y --nodocs \
@@ -16,28 +19,25 @@ ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp

RUN microdnf install -y --nodocs \
which procps findutils tar vim git \
which procps findutils tar vim git\
&& microdnf clean all


## Python Installer ############################################################
FROM base AS python-install
FROM base as python-install
ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf install -y --nodocs \
python${PYTHON_VERSION}-devel && \
python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
pip install --no-cache -U pip wheel uv && \
microdnf clean all
python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all


## CUDA Base ###################################################################
FROM python-install AS cuda-base
FROM python-install as cuda-base

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,30 +51,88 @@ RUN microdnf install -y --nodocs \
ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/



## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# install cuda and common dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
uv pip install \
-r requirements-cuda.txt


## Development #################################################################
FROM python-cuda-base AS dev

# install build and runtime dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
--mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
--mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
--mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
uv pip install \
-r requirements-cuda.txt \
-r requirements-dev.txt

## Builder #####################################################################
FROM dev AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
uv pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
# git is required for the cutlass kernels
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all

COPY . .

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
ARG vllm_fa_cmake_gpu_arches
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Make sure the cuda environment is in the PATH
ENV PATH=/usr/local/cuda/bin:$PATH

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=.git,target=/workspace/.git \
env CFLAGS="-march=haswell" \
CXXFLAGS="$CFLAGS $CXXFLAGS" \
CMAKE_BUILD_TYPE=Release \
python3 setup.py bdist_wheel --dist-dir=dist

#################### libsodium Build IMAGE ####################
FROM base AS libsodium-builder
FROM base as libsodium-builder

RUN microdnf install -y --nodocs gcc gzip \
&& microdnf clean all

WORKDIR /usr/src/libsodium

ARG LIBSODIUM_VERSION
ARG LIBSODIUM_VERSION=1.0.20
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
&& tar -xzvf libsodium*.tar.gz \
&& rm -f libsodium*.tar.gz \
@@ -98,32 +156,25 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"

# Triton needs a CC compiler

RUN microdnf install -y --nodocs gcc \
rsync \
&& microdnf clean all

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install "$(echo dist/*.whl)[audio,video,tensorizer]" --verbose

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
make -C /usr/src/libsodium install
cd /usr/src/libsodium \
&& make install

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

# install vllm by running the payload script and then install flashinfer

ARG VLLM_WHEEL_VERSION
ARG VLLM_WHEEL_INDEX
ARG FLASHINFER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=payload,target=/workspace/payload \
--mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
VLLM_WHEEL_VERSION=${VLLM_VERSION} \
VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
./payload/run.sh && \
uv pip install "${FLASHINFER_VERSION}"
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install \
"https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"

ENV HF_HUB_OFFLINE=1 \
HOME=/home/vllm \
@@ -148,32 +199,25 @@ ENV HF_HUB_OFFLINE=1 \
RUN umask 002 && \
useradd --uid 2000 --gid 0 vllm && \
mkdir -p /home/vllm && \
chmod g+rwx /home/vllm
chmod g+rwx /home/vllm /usr/src /workspace

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

USER 2000
WORKDIR /home/vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


## TGIS Adapter layer #####################################################################
FROM vllm-openai AS vllm-grpc-adapter
FROM vllm-openai as vllm-grpc-adapter

USER root

ARG VLLM_TGIS_ADAPTER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=payload,target=/workspace/payload \
--mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
cd /workspace && \
ls && \
env HOME=/root \
BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
VLLM_WHEEL_VERSION=${VLLM_VERSION} \
VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
./payload/run.sh

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[audio,video,tensorizer]" vllm-tgis-adapter==0.6.3

ENV GRPC_PORT=8033 \
PORT=8000 \
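Since the build-args-file parameter and argfile.konflux are also removed (below), the bare ARGs at the top of the new Dockerfile.ubi no longer receive values from a checked-in args file, so a local build has to pass them explicitly. A sketch: the tag and Python version mirror values visible elsewhere in this diff, and the remaining values are illustrative, not the pipeline's authoritative settings.

    # Illustrative local build of the new Dockerfile.ubi.
    docker build -f Dockerfile.ubi \
      --build-arg BASE_UBI_IMAGE_TAG=9.5-1741850109 \
      --build-arg PYTHON_VERSION=3.12 \
      --build-arg max_jobs=8 \
      --build-arg nvcc_threads=4 \
      --target vllm-openai \
      -t vllm-cuda:dev .

The vllm-grpc-adapter image can be built the same way by swapping the --target value.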
7 changes: 0 additions & 7 deletions argfile.konflux

This file was deleted.

34 changes: 0 additions & 34 deletions payload/run.sh

This file was deleted.
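The deleted payload/run.sh contents are not shown in this diff. From the way the old Dockerfile.ubi invoked it (environment variables BOT_PAT, VLLM_WHEEL_VERSION, VLLM_WHEEL_INDEX, and, in the adapter stage, VLLM_TGIS_ADAPTER_VERSION), its shape was plausibly along these lines; this is a hypothetical reconstruction of the pattern, not the deleted script:

    #!/usr/bin/env bash
    # Hypothetical sketch: reconstructs the pattern implied by the old
    # Dockerfile's invocation, not the deleted file's actual contents.
    set -euo pipefail
    : "${BOT_PAT:?private index token required}"
    : "${VLLM_WHEEL_INDEX:?wheel index URL required}"
    : "${VLLM_WHEEL_VERSION:?wheel version required}"

    # Install the pinned vllm wheel from the authenticated private index.
    uv pip install "vllm==${VLLM_WHEEL_VERSION}" \
      --extra-index-url "https://bot:${BOT_PAT}@${VLLM_WHEEL_INDEX#https://}"

    # The adapter stage also passed VLLM_TGIS_ADAPTER_VERSION, so the script
    # presumably installed the adapter when that variable was set.
    if [[ -n "${VLLM_TGIS_ADAPTER_VERSION:-}" ]]; then
      uv pip install "vllm-tgis-adapter==${VLLM_TGIS_ADAPTER_VERSION}"
    fi

The new Dockerfile replaces this flow entirely: the vllm-openai image installs the wheel produced by the restored build stage plus a pinned flashinfer wheel, and the vllm-grpc-adapter image installs vllm-tgis-adapter==0.6.3 from the public index.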