125 changes: 85 additions & 40 deletions Dockerfile.ubi
@@ -1,9 +1,12 @@
## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.5-1739420147
ARG PYTHON_VERSION=3.12

ARG BASE_UBI_IMAGE_TAG
ARG PYTHON_VERSION
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf -y update && microdnf install -y --nodocs \
@@ -16,14 +19,13 @@ ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp

RUN microdnf install -y --nodocs \
which procps findutils tar vim git \
which procps findutils tar vim git\
&& microdnf clean all


## Python Installer ############################################################
FROM base AS python-install
FROM base as python-install
ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
@@ -37,7 +39,7 @@ RUN microdnf install -y --nodocs \


## CUDA Base ###################################################################
FROM python-install AS cuda-base
FROM python-install as cuda-base

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,6 +53,7 @@ RUN microdnf install -y --nodocs \
ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/



## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base

@@ -65,9 +68,65 @@ RUN --mount=type=cache,target=/root/.cache/uv \
-r requirements-cuda.txt


## Development #################################################################
FROM python-cuda-base AS dev

# install build and runtime dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
--mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
--mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
--mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
uv pip install \
-r requirements-cuda.txt \
-r requirements-dev.txt

## Builder #####################################################################
FROM dev AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
uv pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
# git is required for the cutlass kernels
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all

COPY . .

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
ARG vllm_fa_cmake_gpu_arches
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Make sure the cuda environment is in the PATH
ENV PATH=/usr/local/cuda/bin:$PATH

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=.git,target=/workspace/.git \
env CFLAGS="-march=haswell" \
CXXFLAGS="$CFLAGS $CXXFLAGS" \
CMAKE_BUILD_TYPE=Release \
python3 setup.py bdist_wheel --dist-dir=dist
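
[Editor-added illustration, not part of this diff.] The build stage above compiles the vLLM extensions for every architecture in TORCH_CUDA_ARCH_LIST (7.0 through 9.0+PTX). As a hedged sketch only, a small CUDA host program can report the compute capability of the GPUs visible in a container built from this image, which makes it easy to confirm a device is covered by that list. The file name check_arch.cu is hypothetical; the cudaGetDeviceCount/cudaGetDeviceProperties calls are standard CUDA runtime API.

// check_arch.cu -- editor-added diagnostic sketch, not part of this PR.
// Prints each visible GPU's compute capability so it can be compared
// against the architectures baked in via TORCH_CUDA_ARCH_LIST.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
    std::printf("no CUDA devices visible\n");
    return 1;
  }
  for (int i = 0; i < count; ++i) {
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, i);
    // e.g. 8.0 (A100) or 9.0 (H100) should appear in TORCH_CUDA_ARCH_LIST,
    // or be reachable through the trailing "+PTX" entry via JIT compilation.
    std::printf("GPU %d: %s, compute capability %d.%d\n",
                i, prop.name, prop.major, prop.minor);
  }
  return 0;
}

Compile with, for example, nvcc check_arch.cu -o check_arch; it is purely a diagnostic aid, and the actual wheel build is driven by the ARG/ENV values set above.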

#################### libsodium Build IMAGE ####################
FROM base AS libsodium-builder
FROM base as libsodium-builder

RUN microdnf install -y --nodocs gcc gzip \
&& microdnf clean all
@@ -98,32 +157,24 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"

# Triton needs a CC compiler

RUN microdnf install -y --nodocs gcc \
rsync \
&& microdnf clean all

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install "$(echo dist/*.whl)[audio,video,tensorizer]" --verbose

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
make -C /usr/src/libsodium install

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

# install vllm by running the payload script and then install flashinfer

ARG VLLM_WHEEL_VERSION
ARG VLLM_WHEEL_INDEX
ARG FLASHINFER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=payload,target=/workspace/payload \
--mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
VLLM_WHEEL_VERSION=${VLLM_VERSION} \
VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
./payload/run.sh && \
uv pip install "${FLASHINFER_VERSION}"
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install \
"https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"

ENV HF_HUB_OFFLINE=1 \
HOME=/home/vllm \
@@ -148,32 +199,26 @@ ENV HF_HUB_OFFLINE=1 \
RUN umask 002 && \
useradd --uid 2000 --gid 0 vllm && \
mkdir -p /home/vllm && \
chown vllm:vllm /home/vllm && \
chmod g+rwx /home/vllm

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

USER 2000
WORKDIR /home/vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


## TGIS Adapter layer #####################################################################
FROM vllm-openai AS vllm-grpc-adapter
FROM vllm-openai as vllm-grpc-adapter

USER root

ARG VLLM_TGIS_ADAPTER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=payload,target=/workspace/payload \
--mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
cd /workspace && \
ls && \
env HOME=/root \
BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
VLLM_WHEEL_VERSION=${VLLM_VERSION} \
VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
./payload/run.sh

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3

ENV GRPC_PORT=8033 \
PORT=8000 \
7 changes: 0 additions & 7 deletions argfile.konflux

This file was deleted.

83 changes: 82 additions & 1 deletion csrc/activation_kernels.cu
@@ -4,6 +4,8 @@

#include <cmath>

#include "core/math.hpp"

#include "cuda_compat.h"
#include "dispatch_utils.h"

@@ -31,6 +33,69 @@ __global__ void act_and_mul_kernel(
}
}

// NOTE: temporary vectorized version.

template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void act_and_mul_kernel_vectorized(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d) {
const int64_t token_idx = blockIdx.x;
const int32_t blocks_per_token = gridDim.y;

const int32_t elems_per_128bit_load = (128 / 8) / sizeof(scalar_t);

const int32_t tgt_elems_per_block = ceil_div(d, blocks_per_token);
const int32_t elems_per_block =
next_multiple_of(elems_per_128bit_load, tgt_elems_per_block);
const int64_t block_start = blockIdx.y * int64_t(elems_per_block);
int64_t block_end = block_start + elems_per_block;
block_end = block_end > d ? d : block_end;

const scalar_t* __restrict__ x_ptr = input + token_idx * 2 * d;
const scalar_t* __restrict__ y_ptr = input + token_idx * 2 * d + d;
scalar_t* __restrict__ out_ptr = out + token_idx * d;

// 128-bit vectorized code
const int32_t vec_loop_end =
prev_multiple_of(elems_per_128bit_load, block_end);
const int32_t vec_end_idx = vec_loop_end / elems_per_128bit_load;
const int32_t vec_start_idx = block_start / elems_per_128bit_load;

const int4* __restrict__ x_128bit_ptr = reinterpret_cast<const int4*>(x_ptr);
const int4* __restrict__ y_128bit_ptr = reinterpret_cast<const int4*>(y_ptr);
int4* __restrict__ out_128bit_ptr = reinterpret_cast<int4*>(out_ptr);

#pragma unroll
for (int32_t vec_idx = vec_start_idx + threadIdx.x; vec_idx < vec_end_idx;
vec_idx += blockDim.x) {
const int4 x_128bit = VLLM_LDG(&x_128bit_ptr[vec_idx]);
const int4 y_128bit = VLLM_LDG(&y_128bit_ptr[vec_idx]);
using scalar_128bit_vec_t = std::array<scalar_t, elems_per_128bit_load>;

scalar_128bit_vec_t out_vec;
const auto x_vec = reinterpret_cast<scalar_128bit_vec_t const&>(x_128bit);
const auto y_vec = reinterpret_cast<scalar_128bit_vec_t const&>(y_128bit);

#pragma unroll
for (int i = 0; i < elems_per_128bit_load; i++) {
out_vec[i] = ACT_FN(x_vec[i]) * y_vec[i];
}

out_128bit_ptr[vec_idx] = reinterpret_cast<const int4&>(out_vec);
}

// Scalar cleanup code
if (block_end > vec_loop_end) {
for (int64_t idx = vec_loop_end + threadIdx.x; idx < block_end;
idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&x_ptr[idx]);
const scalar_t y = VLLM_LDG(&y_ptr[idx]);
out_ptr[idx] = ACT_FN(x) * y;
}
}
}
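
[Editor-added illustration, not part of this diff.] The new kernel splits each token's row of length d across gridDim.y blocks, rounds the per-block range to 128-bit (16-byte) boundaries so most elements are read as int4 vectors, and handles the remainder with the scalar tail loop above. The following host-side C++ sketch only mirrors that indexing math, with fixed int32_t helpers standing in for the templates in csrc/core/math.hpp; the sample sizes (d = 11008, 2 blocks per token, fp16 elements) are arbitrary.

// partition_demo.cpp -- editor-added illustration of the indexing math in
// act_and_mul_kernel_vectorized; not part of this PR.
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr int32_t ceil_div(int32_t a, int32_t b) { return (a + b - 1) / b; }
constexpr int32_t next_multiple_of(int32_t a, int32_t b) { return ceil_div(b, a) * a; }
constexpr int32_t prev_multiple_of(int32_t a, int32_t b) { return (b / a) * a; }

int main() {
  const int32_t d = 11008;                        // length of one gate/up half
  const int32_t blocks_per_token = 2;             // gridDim.y chosen by the launch macro
  const int32_t elems_per_128bit_load = 16 / 2;   // 8 fp16 values per int4 load

  const int32_t tgt = ceil_div(d, blocks_per_token);
  const int32_t elems_per_block = next_multiple_of(elems_per_128bit_load, tgt);

  for (int32_t by = 0; by < blocks_per_token; ++by) {
    const int32_t block_start = by * elems_per_block;
    const int32_t block_end = std::min(block_start + elems_per_block, d);
    const int32_t vec_loop_end = prev_multiple_of(elems_per_128bit_load, block_end);
    // Elements [block_start, vec_loop_end) are covered by int4 loads,
    // [vec_loop_end, block_end) by the scalar cleanup loop.
    std::printf("block %d: elems [%d, %d), vectorized up to %d, %d scalar tail elems\n",
                (int)by, (int)block_start, (int)block_end, (int)vec_loop_end,
                (int)(block_end - vec_loop_end));
  }
  return 0;
}

Because elems_per_block is itself a multiple of elems_per_128bit_load, every block_start is a multiple of 16 bytes from the start of the row, which is what makes the reinterpret_cast to int4 in the kernel valid provided the tensor rows themselves are 16-byte aligned (true for typical hidden sizes).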

template <typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
@@ -79,10 +144,26 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
input.data_ptr<scalar_t>(), d); \
});

// Launch activation and gating kernel.
// Vectorized Version
#define LAUNCH_ACTIVATION_GATE_KERNEL_VECTORIZED(KERNEL) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens, num_tokens > 16 ? num_tokens > 32 ? 1 : 2 : 4); \
dim3 block(std::min(d, 512)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel_vectorized", [&] { \
vllm::act_and_mul_kernel_vectorized<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});
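
[Editor-added illustration, not part of this diff.] In the vectorized launch macro just above, grid.x is the token count and grid.y is the number of blocks cooperating on one token. The nested ternary evaluates to 4 blocks per token when num_tokens <= 16, 2 when it is between 17 and 32, and 1 otherwise, so small batches get more intra-token parallelism while large batches rely on token-level parallelism alone. A brief C++ restatement of that heuristic follows; the function name blocks_per_token is hypothetical and does not appear in the diff.

// blocks_per_token_demo.cpp -- editor-added restatement of the grid.y heuristic
// used by LAUNCH_ACTIVATION_GATE_KERNEL_VECTORIZED; not part of this PR.
#include <cstdint>
#include <cstdio>

// Same value as: num_tokens > 16 ? (num_tokens > 32 ? 1 : 2) : 4
constexpr int blocks_per_token(int64_t num_tokens) {
  if (num_tokens <= 16) return 4;
  if (num_tokens <= 32) return 2;
  return 1;
}

static_assert(blocks_per_token(8) == 4, "small batches: 4 blocks per token");
static_assert(blocks_per_token(24) == 2, "mid-size batches: 2 blocks per token");
static_assert(blocks_per_token(1024) == 1, "large batches: 1 block per token");

int main() {
  for (int64_t n : {1, 16, 17, 32, 33, 256}) {
    std::printf("num_tokens=%lld -> grid = (%lld, %d)\n",
                static_cast<long long>(n), static_cast<long long>(n),
                blocks_per_token(n));
  }
  return 0;
}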

void silu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
LAUNCH_ACTIVATION_GATE_KERNEL_VECTORIZED(vllm::silu_kernel);
}

void mul_and_silu(torch::Tensor& out, // [..., d]
14 changes: 13 additions & 1 deletion csrc/core/math.hpp
@@ -11,4 +11,16 @@ inline constexpr uint32_t next_pow_2(uint32_t const num) {
template <typename T>
inline constexpr std::enable_if_t<std::is_integral_v<T>, T> ceil_div(T a, T b) {
return (a + b - 1) / b;
}
}

// Compute the next multiple of a that is greater than or equal to b
template <typename A, typename B>
static inline constexpr auto next_multiple_of(A a, B b) {
return ceil_div(b, a) * a;
}

// Compute the largest multiple of a that is less than or equal to b
template <typename A, typename B>
static inline constexpr auto prev_multiple_of(A a, B b) {
return (b / a) * a;
}
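
[Editor-added illustration, not part of this diff.] next_multiple_of(a, b) rounds b up to the nearest multiple of a and prev_multiple_of(a, b) rounds it down; both build on the existing ceil_div and are constexpr, so the expected behaviour can be pinned down at compile time. A short usage sketch follows, with the helpers restated in simplified form so the snippet is self-contained.

// math_helpers_demo.cpp -- editor-added usage example for the helpers in
// csrc/core/math.hpp; not part of this PR.
template <typename T>
inline constexpr T ceil_div(T a, T b) { return (a + b - 1) / b; }

template <typename A, typename B>
inline constexpr auto next_multiple_of(A a, B b) { return ceil_div(b, a) * a; }

template <typename A, typename B>
inline constexpr auto prev_multiple_of(A a, B b) { return (b / a) * a; }

// Rounding b = 13 to multiples of a = 8, as the vectorized activation kernel
// does when aligning per-block ranges to 128-bit loads:
static_assert(next_multiple_of(8, 13) == 16, "round up to the next multiple");
static_assert(prev_multiple_of(8, 13) == 8, "round down to the previous multiple");
static_assert(next_multiple_of(8, 16) == 16, "an aligned value stays put");
static_assert(prev_multiple_of(8, 16) == 16, "an aligned value stays put");

int main() { return 0; }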