125 changes: 85 additions & 40 deletions Dockerfile.ubi
@@ -1,9 +1,12 @@
## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.5-1739420147
ARG PYTHON_VERSION=3.12

ARG BASE_UBI_IMAGE_TAG
ARG PYTHON_VERSION
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf -y update && microdnf install -y --nodocs \
@@ -16,14 +19,13 @@ ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp

RUN microdnf install -y --nodocs \
which procps findutils tar vim git \
which procps findutils tar vim git\
&& microdnf clean all


## Python Installer ############################################################
FROM base AS python-install
FROM base as python-install
ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
@@ -37,7 +39,7 @@ RUN microdnf install -y --nodocs \


## CUDA Base ###################################################################
FROM python-install AS cuda-base
FROM python-install as cuda-base

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,6 +53,7 @@ RUN microdnf install -y --nodocs \
ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/



## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base

@@ -65,9 +68,65 @@ RUN --mount=type=cache,target=/root/.cache/uv \
-r requirements-cuda.txt


## Development #################################################################
FROM python-cuda-base AS dev

# install build and runtime dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
--mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
--mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
--mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
uv pip install \
-r requirements-cuda.txt \
-r requirements-dev.txt

## Builder #####################################################################
FROM dev AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
uv pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
# git is required for the cutlass kernels
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all

COPY . .

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
ARG vllm_fa_cmake_gpu_arches
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Make sure the cuda environment is in the PATH
ENV PATH=/usr/local/cuda/bin:$PATH

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=.git,target=/workspace/.git \
env CFLAGS="-march=haswell" \
CXXFLAGS="$CFLAGS $CXXFLAGS" \
CMAKE_BUILD_TYPE=Release \
python3 setup.py bdist_wheel --dist-dir=dist
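
[Editor-added illustration, not part of this diff.] The build stage above compiles the vLLM extensions for every architecture in TORCH_CUDA_ARCH_LIST (7.0 through 9.0+PTX). As a hedged sketch only, a small CUDA host program can report the compute capability of the GPUs visible in a container built from this image, which makes it easy to confirm a device is covered by that list. The file name check_arch.cu is hypothetical; the cudaGetDeviceCount/cudaGetDeviceProperties calls are standard CUDA runtime API.

// check_arch.cu -- editor-added diagnostic sketch, not part of this PR.
// Prints each visible GPU's compute capability so it can be compared
// against the architectures baked in via TORCH_CUDA_ARCH_LIST.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
    std::printf("no CUDA devices visible\n");
    return 1;
  }
  for (int i = 0; i < count; ++i) {
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, i);
    // e.g. 8.0 (A100) or 9.0 (H100) should appear in TORCH_CUDA_ARCH_LIST,
    // or be reachable through the trailing "+PTX" entry via JIT compilation.
    std::printf("GPU %d: %s, compute capability %d.%d\n",
                i, prop.name, prop.major, prop.minor);
  }
  return 0;
}

Compile with, for example, nvcc check_arch.cu -o check_arch; it is purely a diagnostic aid, and the actual wheel build is driven by the ARG/ENV values set above.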

#################### libsodium Build IMAGE ####################
FROM base AS libsodium-builder
FROM base as libsodium-builder

RUN microdnf install -y --nodocs gcc gzip \
&& microdnf clean all
@@ -98,32 +157,24 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"

# Triton needs a CC compiler

RUN microdnf install -y --nodocs gcc \
rsync \
&& microdnf clean all

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install "$(echo dist/*.whl)[audio,video,tensorizer]" --verbose

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
make -C /usr/src/libsodium install

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

# install vllm by running the payload script and then install flashinfer

ARG VLLM_WHEEL_VERSION
ARG VLLM_WHEEL_INDEX
ARG FLASHINFER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=payload,target=/workspace/payload \
--mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
VLLM_WHEEL_VERSION=${VLLM_VERSION} \
VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
./payload/run.sh && \
uv pip install "${FLASHINFER_VERSION}"
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install \
"https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"

ENV HF_HUB_OFFLINE=1 \
HOME=/home/vllm \
@@ -148,32 +199,26 @@ ENV HF_HUB_OFFLINE=1 \
RUN umask 002 && \
useradd --uid 2000 --gid 0 vllm && \
mkdir -p /home/vllm && \
chown vllm:vllm /home/vllm && \
chmod g+rwx /home/vllm

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

USER 2000
WORKDIR /home/vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


## TGIS Adapter layer #####################################################################
FROM vllm-openai AS vllm-grpc-adapter
FROM vllm-openai as vllm-grpc-adapter

USER root

ARG VLLM_TGIS_ADAPTER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=payload,target=/workspace/payload \
--mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
cd /workspace && \
ls && \
env HOME=/root \
BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
VLLM_WHEEL_VERSION=${VLLM_VERSION} \
VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
./payload/run.sh

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3

ENV GRPC_PORT=8033 \
PORT=8000 \
7 changes: 0 additions & 7 deletions argfile.konflux

This file was deleted.

83 changes: 82 additions & 1 deletion csrc/activation_kernels.cu
@@ -4,6 +4,8 @@

#include <cmath>

#include "core/math.hpp"

#include "cuda_compat.h"
#include "dispatch_utils.h"

@@ -31,6 +33,69 @@ __global__ void act_and_mul_kernel(
}
}

// NOTE: temporary vectorized version.

template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void act_and_mul_kernel_vectorized(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d) {
const int64_t token_idx = blockIdx.x;
const int32_t blocks_per_token = gridDim.y;

const int32_t elems_per_128bit_load = (128 / 8) / sizeof(scalar_t);

const int32_t tgt_elems_per_block = ceil_div(d, blocks_per_token);
const int32_t elems_per_block =
next_multiple_of(elems_per_128bit_load, tgt_elems_per_block);
const int64_t block_start = blockIdx.y * int64_t(elems_per_block);
int64_t block_end = block_start + elems_per_block;
block_end = block_end > d ? d : block_end;

const scalar_t* __restrict__ x_ptr = input + token_idx * 2 * d;
const scalar_t* __restrict__ y_ptr = input + token_idx * 2 * d + d;
scalar_t* __restrict__ out_ptr = out + token_idx * d;

// 128-bit vectorized code
const int32_t vec_loop_end =
prev_multiple_of(elems_per_128bit_load, block_end);
const int32_t vec_end_idx = vec_loop_end / elems_per_128bit_load;
const int32_t vec_start_idx = block_start / elems_per_128bit_load;

const int4* __restrict__ x_128bit_ptr = reinterpret_cast<const int4*>(x_ptr);
const int4* __restrict__ y_128bit_ptr = reinterpret_cast<const int4*>(y_ptr);
int4* __restrict__ out_128bit_ptr = reinterpret_cast<int4*>(out_ptr);

#pragma unroll
for (int32_t vec_idx = vec_start_idx + threadIdx.x; vec_idx < vec_end_idx;
vec_idx += blockDim.x) {
const int4 x_128bit = VLLM_LDG(&x_128bit_ptr[vec_idx]);
const int4 y_128bit = VLLM_LDG(&y_128bit_ptr[vec_idx]);
using scalar_128bit_vec_t = std::array<scalar_t, elems_per_128bit_load>;

scalar_128bit_vec_t out_vec;
const auto x_vec = reinterpret_cast<scalar_128bit_vec_t const&>(x_128bit);
const auto y_vec = reinterpret_cast<scalar_128bit_vec_t const&>(y_128bit);

#pragma unroll
for (int i = 0; i < elems_per_128bit_load; i++) {
out_vec[i] = ACT_FN(x_vec[i]) * y_vec[i];
}

out_128bit_ptr[vec_idx] = reinterpret_cast<const int4&>(out_vec);
}

// Scalar cleanup code
if (block_end > vec_loop_end) {
for (int64_t idx = vec_loop_end + threadIdx.x; idx < block_end;
idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&x_ptr[idx]);
const scalar_t y = VLLM_LDG(&y_ptr[idx]);
out_ptr[idx] = ACT_FN(x) * y;
}
}
}
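
[Editor-added illustration, not part of this diff.] The new kernel splits each token's row of length d across gridDim.y blocks, rounds the per-block range to 128-bit (16-byte) boundaries so most elements are read as int4 vectors, and handles the remainder with the scalar tail loop above. The following host-side C++ sketch only mirrors that indexing math, with fixed int32_t helpers standing in for the templates in csrc/core/math.hpp; the sample sizes (d = 11008, 2 blocks per token, fp16 elements) are arbitrary.

// partition_demo.cpp -- editor-added illustration of the indexing math in
// act_and_mul_kernel_vectorized; not part of this PR.
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr int32_t ceil_div(int32_t a, int32_t b) { return (a + b - 1) / b; }
constexpr int32_t next_multiple_of(int32_t a, int32_t b) { return ceil_div(b, a) * a; }
constexpr int32_t prev_multiple_of(int32_t a, int32_t b) { return (b / a) * a; }

int main() {
  const int32_t d = 11008;                        // length of one gate/up half
  const int32_t blocks_per_token = 2;             // gridDim.y chosen by the launch macro
  const int32_t elems_per_128bit_load = 16 / 2;   // 8 fp16 values per int4 load

  const int32_t tgt = ceil_div(d, blocks_per_token);
  const int32_t elems_per_block = next_multiple_of(elems_per_128bit_load, tgt);

  for (int32_t by = 0; by < blocks_per_token; ++by) {
    const int32_t block_start = by * elems_per_block;
    const int32_t block_end = std::min(block_start + elems_per_block, d);
    const int32_t vec_loop_end = prev_multiple_of(elems_per_128bit_load, block_end);
    // Elements [block_start, vec_loop_end) are covered by int4 loads,
    // [vec_loop_end, block_end) by the scalar cleanup loop.
    std::printf("block %d: elems [%d, %d), vectorized up to %d, %d scalar tail elems\n",
                (int)by, (int)block_start, (int)block_end, (int)vec_loop_end,
                (int)(block_end - vec_loop_end));
  }
  return 0;
}

Because elems_per_block is itself a multiple of elems_per_128bit_load, every block_start is a multiple of 16 bytes from the start of the row, which is what makes the reinterpret_cast to int4 in the kernel valid provided the tensor rows themselves are 16-byte aligned (true for typical hidden sizes).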

template <typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
@@ -79,10 +144,26 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
input.data_ptr<scalar_t>(), d); \
});

// Launch activation and gating kernel.
// Vectorized Version
#define LAUNCH_ACTIVATION_GATE_KERNEL_VECTORIZED(KERNEL) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens, num_tokens > 16 ? num_tokens > 32 ? 1 : 2 : 4); \
dim3 block(std::min(d, 512)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel_vectorized", [&] { \
vllm::act_and_mul_kernel_vectorized<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});
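
[Editor-added illustration, not part of this diff.] In the vectorized launch macro just above, grid.x is the token count and grid.y is the number of blocks cooperating on one token. The nested ternary evaluates to 4 blocks per token when num_tokens <= 16, 2 when it is between 17 and 32, and 1 otherwise, so small batches get more intra-token parallelism while large batches rely on token-level parallelism alone. A brief C++ restatement of that heuristic follows; the function name blocks_per_token is hypothetical and does not appear in the diff.

// blocks_per_token_demo.cpp -- editor-added restatement of the grid.y heuristic
// used by LAUNCH_ACTIVATION_GATE_KERNEL_VECTORIZED; not part of this PR.
#include <cstdint>
#include <cstdio>

// Same value as: num_tokens > 16 ? (num_tokens > 32 ? 1 : 2) : 4
constexpr int blocks_per_token(int64_t num_tokens) {
  if (num_tokens <= 16) return 4;
  if (num_tokens <= 32) return 2;
  return 1;
}

static_assert(blocks_per_token(8) == 4, "small batches: 4 blocks per token");
static_assert(blocks_per_token(24) == 2, "mid-size batches: 2 blocks per token");
static_assert(blocks_per_token(1024) == 1, "large batches: 1 block per token");

int main() {
  for (int64_t n : {1, 16, 17, 32, 33, 256}) {
    std::printf("num_tokens=%lld -> grid = (%lld, %d)\n",
                static_cast<long long>(n), static_cast<long long>(n),
                blocks_per_token(n));
  }
  return 0;
}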

void silu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
LAUNCH_ACTIVATION_GATE_KERNEL_VECTORIZED(vllm::silu_kernel);
}

void mul_and_silu(torch::Tensor& out, // [..., d]
14 changes: 13 additions & 1 deletion csrc/core/math.hpp
@@ -11,4 +11,16 @@ inline constexpr uint32_t next_pow_2(uint32_t const num) {
template <typename T>
inline constexpr std::enable_if_t<std::is_integral_v<T>, T> ceil_div(T a, T b) {
return (a + b - 1) / b;
}
}

// Compute the next multiple of a that is greater than or equal to b
template <typename A, typename B>
static inline constexpr auto next_multiple_of(A a, B b) {
return ceil_div(b, a) * a;
}

// Compute the largest multiple of a that is less than or equal to b
template <typename A, typename B>
static inline constexpr auto prev_multiple_of(A a, B b) {
return (b / a) * a;
}
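
[Editor-added illustration, not part of this diff.] next_multiple_of(a, b) rounds b up to the nearest multiple of a and prev_multiple_of(a, b) rounds it down; both build on the existing ceil_div and are constexpr, so the expected behaviour can be pinned down at compile time. A short usage sketch follows, with the helpers restated in simplified form so the snippet is self-contained.

// math_helpers_demo.cpp -- editor-added usage example for the helpers in
// csrc/core/math.hpp; not part of this PR.
template <typename T>
inline constexpr T ceil_div(T a, T b) { return (a + b - 1) / b; }

template <typename A, typename B>
inline constexpr auto next_multiple_of(A a, B b) { return ceil_div(b, a) * a; }

template <typename A, typename B>
inline constexpr auto prev_multiple_of(A a, B b) { return (b / a) * a; }

// Rounding b = 13 to multiples of a = 8, as the vectorized activation kernel
// does when aligning per-block ranges to 128-bit loads:
static_assert(next_multiple_of(8, 13) == 16, "round up to the next multiple");
static_assert(prev_multiple_of(8, 13) == 8, "round down to the previous multiple");
static_assert(next_multiple_of(8, 16) == 16, "an aligned value stays put");
static_assert(prev_multiple_of(8, 16) == 16, "an aligned value stays put");

int main() { return 0; }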