diff --git a/README.md b/README.md index e9e3cb2b4..c7659008f 100644 --- a/README.md +++ b/README.md @@ -247,13 +247,36 @@ curl http://localhost:8080/metrics Check [METRICS.md](./METRICS.md) for more details. -## Kubernetes +## Kubernetes Deployment -Experimental support for running in Kubernetes is available -in the form of [a Helm chart and static YAML](charts/docker-model-runner/README.md). +Docker Model Runner provides production-ready deployment guides for running vLLM on Kubernetes (via llm-d) with advanced features: -If you are interested in a specific Kubernetes use-case, please start a -discussion on the issue tracker. +- **Intelligent Inference Scheduling** - Load balancing optimized for LLM workloads +- **Prefill/Decode Disaggregation** - Improved latency and throughput +- **Wide Expert-Parallelism** - Support for large Mixture-of-Experts models +- **Multi-GPU Support** - NVIDIA, AMD, Google TPU, and Intel XPU + +### Quick Start + +```bash +# List available deployment configurations +docker model k8s list-configs + +# View deployment guides +docker model k8s guide + +# Deploy vLLM with intelligent scheduling +docker model k8s deploy --config inference-scheduling --namespace vllm-inference +``` + +### Prerequisites + +- Kubernetes cluster (version 1.29+) +- kubectl configured to access your cluster +- helm (version 3.x+) +- GPU-enabled nodes + +For detailed deployment instructions, see the [k8s directory](./k8s/README.md). ## Community diff --git a/cmd/cli/README.md b/cmd/cli/README.md index face3e6b8..84359cc8d 100644 --- a/cmd/cli/README.md +++ b/cmd/cli/README.md @@ -47,6 +47,13 @@ Run `./model --help` to see all commands and options. - `model pull MODEL` — Pull a model - `model push MODEL` — Push a model - `model rm MODEL` — Remove a model +- `model k8s` — Deploy vLLM on Kubernetes (via llm-d) + +### Kubernetes Deployment Commands + +- `model k8s list-configs` — List available deployment configurations +- `model k8s deploy --config ` — Deploy vLLM with a specific configuration +- `model k8s guide` — Display detailed deployment guides ## Example: Interactive Chat ```bash diff --git a/cmd/cli/commands/k8s.go b/cmd/cli/commands/k8s.go new file mode 100644 index 000000000..85f67f66f --- /dev/null +++ b/cmd/cli/commands/k8s.go @@ -0,0 +1,238 @@ +package commands + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + + "github.com/spf13/cobra" +) + +func newK8sCmd() *cobra.Command { + c := &cobra.Command{ + Use: "k8s", + Short: "Deploy vLLM on Kubernetes", + Long: `Deploy vLLM inference servers on Kubernetes with intelligent load balancing. + +This command provides deployment guides and tools for running vLLM on Kubernetes +with production-ready configurations including: +- Intelligent inference scheduling with load balancing +- Prefill/Decode disaggregation for better performance +- Wide Expert-Parallelism for large MoE models +- Support for NVIDIA GPUs, AMD GPUs, Google TPUs, and Intel XPUs`, + } + + c.AddCommand( + newK8sDeployCmd(), + newK8sListConfigsCmd(), + newK8sGuideCmd(), + ) + + return c +} + +func newK8sDeployCmd() *cobra.Command { + var namespace string + var config string + var model string + var replicas int + + c := &cobra.Command{ + Use: "deploy", + Short: "Deploy vLLM on Kubernetes", + Long: "Deploy vLLM inference server on Kubernetes with the specified configuration", + RunE: func(cmd *cobra.Command, args []string) error { + if config == "" { + return fmt.Errorf("--config is required. 
Use 'docker model k8s list-configs' to see available configurations") + } + + // Get the path to the k8s resources + resourcesPath, err := getK8sResourcesPath() + if err != nil { + return err + } + + configPath := filepath.Join(resourcesPath, "configs", config) + if _, err := os.Stat(configPath); os.IsNotExist(err) { + return fmt.Errorf("configuration '%s' not found. Use 'docker model k8s list-configs' to see available configurations", config) + } + + cmd.Printf("Deploying vLLM with configuration: %s\n", config) + cmd.Printf("Namespace: %s\n", namespace) + if model != "" { + cmd.Printf("Model: %s\n", model) + } + cmd.Printf("Replicas: %d\n", replicas) + + // Check if kubectl is available + if _, err := exec.LookPath("kubectl"); err != nil { + return fmt.Errorf("kubectl not found in PATH. Please install kubectl to deploy to Kubernetes") + } + + // Check if helm is available for more complex deployments + if _, err := exec.LookPath("helm"); err != nil { + cmd.PrintErrln("Warning: helm not found in PATH. Some deployment options may not be available.") + } + + cmd.Println("\nDeployment instructions:") + cmd.Printf("1. Ensure your kubectl context is set to the correct cluster\n") + cmd.Printf("2. Create namespace if it doesn't exist: kubectl create namespace %s\n", namespace) + cmd.Printf("3. Apply the configuration: kubectl apply -f %s -n %s\n", configPath, namespace) + cmd.Printf("\nFor detailed deployment guides, run: docker model k8s guide\n") + + return nil + }, + } + + c.Flags().StringVarP(&namespace, "namespace", "n", "vllm-inference", "Kubernetes namespace") + c.Flags().StringVarP(&config, "config", "c", "", "Configuration to deploy (required)") + c.Flags().StringVarP(&model, "model", "m", "", "Model to deploy") + c.Flags().IntVarP(&replicas, "replicas", "r", 1, "Number of replicas") + _ = c.MarkFlagRequired("config") + + return c +} + +func newK8sListConfigsCmd() *cobra.Command { + c := &cobra.Command{ + Use: "list-configs", + Short: "List available Kubernetes deployment configurations", + Long: "List all available pre-configured deployment options for vLLM on Kubernetes", + RunE: func(cmd *cobra.Command, args []string) error { + cmd.Println("Available Kubernetes deployment configurations:") + cmd.Println() + cmd.Println("1. inference-scheduling") + cmd.Println(" Deploy vLLM with intelligent inference scheduling for optimal load balancing") + cmd.Println() + cmd.Println("2. pd-disaggregation") + cmd.Println(" Prefill/Decode disaggregation for improved latency and throughput") + cmd.Println() + cmd.Println("3. wide-ep") + cmd.Println(" Wide Expert-Parallelism for large Mixture-of-Experts models") + cmd.Println() + cmd.Println("4. 
simulated-accelerators") + cmd.Println(" Deploy with simulated accelerators for testing") + cmd.Println() + cmd.Println("Use 'docker model k8s deploy --config ' to deploy a configuration") + cmd.Println("Use 'docker model k8s guide' to view detailed deployment guides") + return nil + }, + } + return c +} + +func newK8sGuideCmd() *cobra.Command { + c := &cobra.Command{ + Use: "guide", + Short: "Display deployment guides", + Long: "Display detailed guides for deploying vLLM on Kubernetes", + RunE: func(cmd *cobra.Command, args []string) error { + resourcesPath, err := getK8sResourcesPath() + if err != nil { + return err + } + + guidePath := filepath.Join(resourcesPath, "guides", "README.md") + content, err := os.ReadFile(guidePath) + if err != nil { + // Fallback to inline guide if file doesn't exist + cmd.Println(getInlineGuide()) + return nil + } + + cmd.Println(string(content)) + return nil + }, + } + return c +} + +func getK8sResourcesPath() (string, error) { + // Try to find the k8s resources directory + // First check if it's in the current directory structure + candidates := []string{ + "./k8s", + "./deploy/k8s", + filepath.Join(os.Getenv("HOME"), ".docker", "model", "k8s"), + } + + for _, candidate := range candidates { + if _, err := os.Stat(candidate); err == nil { + return candidate, nil + } + } + + // If not found, we'll create a minimal structure + homePath := filepath.Join(os.Getenv("HOME"), ".docker", "model", "k8s") + if err := os.MkdirAll(homePath, 0755); err != nil { + return "", fmt.Errorf("failed to create k8s resources directory: %w", err) + } + + return homePath, nil +} + +func getInlineGuide() string { + return `# vLLM Kubernetes Deployment Guide + +## Overview + +This guide helps you deploy vLLM inference servers on Kubernetes with production-ready configurations. + +## Prerequisites + +1. **Kubernetes Cluster**: A running Kubernetes cluster (version 1.29 or later) +2. **kubectl**: Kubernetes command-line tool configured to access your cluster +3. **helm**: Helm package manager (version 3.x or later) +4. **GPU Support**: Nodes with supported GPUs (NVIDIA, AMD, Google TPU, or Intel XPU) + +## Quick Start + +### 1. Choose a Deployment Configuration + +Available configurations: +- **inference-scheduling**: Intelligent load balancing for optimal throughput +- **pd-disaggregation**: Separate prefill and decode phases for better latency +- **wide-ep**: Expert parallelism for large MoE models + +### 2. Deploy + +` + "```bash" + ` +# Set your namespace +export NAMESPACE=vllm-inference + +# Create namespace +kubectl create namespace $NAMESPACE + +# Deploy with a configuration +docker model k8s deploy --config inference-scheduling --namespace $NAMESPACE +` + "```" + ` + +### 3. Verify Deployment + +` + "```bash" + ` +# Check pods +kubectl get pods -n $NAMESPACE + +# Check services +kubectl get services -n $NAMESPACE +` + "```" + ` + +## Hardware Support + +vLLM on Kubernetes supports: +- NVIDIA GPUs (A100, H100, L4, etc.) +- AMD GPUs (MI250, MI300, etc.) +- Google TPUs (v5e and newer) +- Intel XPUs (Ponte Vecchio and newer) + +## Next Steps + +1. Configure your model serving parameters +2. Set up monitoring and observability +3. Configure autoscaling based on your workload +4. 
Implement request routing and load balancing + +For more detailed information, visit the vLLM documentation at https://docs.vllm.ai +` +} diff --git a/cmd/cli/commands/root.go b/cmd/cli/commands/root.go index 0aa5b1a79..c54d0ead5 100644 --- a/cmd/cli/commands/root.go +++ b/cmd/cli/commands/root.go @@ -115,6 +115,7 @@ func NewRootCmd(cli *command.DockerCli) *cobra.Command { newDFCmd(), newUnloadCmd(), newRequestsCmd(), + newK8sCmd(), ) return rootCmd } diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 000000000..9269305fa --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,58 @@ +# vLLM Kubernetes Deployment Resources + +This directory contains deployment guides, manifests, and tools for deploying vLLM inference servers on Kubernetes. + +## Structure + +- `guides/` - Deployment guides and examples for different configurations +- `docker/` - Dockerfiles for building vLLM container images +- `scripts/` - Helper scripts for deployment and management + +## Usage + +Use the Docker Model CLI to deploy vLLM on Kubernetes: + +```bash +# List available deployment configurations +docker model k8s list-configs + +# View deployment guides +docker model k8s guide + +# Deploy a configuration +docker model k8s deploy --config inference-scheduling --namespace vllm-inference +``` + +## Available Deployment Configurations + +1. **inference-scheduling** - Intelligent inference scheduling with load balancing +2. **pd-disaggregation** - Prefill/Decode disaggregation for better performance +3. **wide-ep** - Wide Expert-Parallelism for large MoE models +4. **simulated-accelerators** - Deploy with simulated accelerators for testing + +## Prerequisites + +- Kubernetes cluster (version 1.29 or later) +- kubectl configured to access your cluster +- helm (version 3.x or later) +- GPU support (NVIDIA, AMD, Google TPU, or Intel XPU) + +## Documentation + +For detailed deployment guides, see the `guides/` directory or run: + +```bash +docker model k8s guide +``` + +## Hardware Support + +vLLM on Kubernetes supports: +- NVIDIA GPUs (A100, H100, L4, etc.) +- AMD GPUs (MI250, MI300, etc.) +- Google TPUs (v5e and newer) +- Intel XPUs (Ponte Vecchio and newer) + +## Contributing + +This deployment configuration is based on production-tested patterns for running vLLM at scale. 
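+
+## Example: Applying a Configuration by Hand
+
+The `docker model k8s deploy` command prints deployment instructions rather than applying manifests itself. As a rough sketch, the manual flow looks like the following -- the configuration path below is illustrative, so adjust it to whatever the CLI prints for your chosen configuration:
+
+```bash
+export NAMESPACE=vllm-inference
+
+# Create the namespace if it does not already exist
+kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
+
+# Apply the configuration manifests (the CLI prints the exact path)
+kubectl apply -f ./k8s/configs/inference-scheduling -n "$NAMESPACE"
+
+# Watch the rollout and confirm GPU resources are visible to the scheduler
+# (shown for NVIDIA nodes; other accelerators expose different resource names)
+kubectl get pods -n "$NAMESPACE" -w
+kubectl describe nodes | grep -i 'nvidia.com/gpu'
+```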
diff --git a/k8s/docker/Dockerfile.aws b/k8s/docker/Dockerfile.aws new file mode 100644 index 000000000..57a3a091d --- /dev/null +++ b/k8s/docker/Dockerfile.aws @@ -0,0 +1,489 @@ +ARG CUDA_MAJOR=12 +ARG CUDA_MINOR=8 +ARG CUDA_PATCH=1 + +# ============================================================================ +# BUILD STAGE - Install build dependencies and create wheels +# ============================================================================ +FROM nvcr.io/nvidia/cuda-dl-base:25.01-cuda12.8-devel-ubuntu24.04 AS builder + +ARG CUDA_MAJOR=12 +ARG CUDA_MINOR=8 +ARG CUDA_PATCH=1 +ARG PYTHON_VERSION + +ARG USE_SCCACHE=true + +COPY sccache.config.toml /tmp + +RUN --mount=type=secret,id=aws_access_key_id \ + --mount=type=secret,id=aws_secret_access_key \ + --mount=type=tmpfs,target=/root/.aws \ + if [ "${USE_SCCACHE}" = "true" ]; then \ + set -Eeuo pipefail && \ + # install sccache + apt-get update && apt-get install -y libssl-dev && \ + mkdir -p /tmp/sccache && \ + cd /tmp/sccache && \ + curl -sLO https://github.com/mozilla/sccache/releases/download/v0.10.0/sccache-v0.10.0-x86_64-unknown-linux-musl.tar.gz && \ + tar -xf sccache-v0.10.0-x86_64-unknown-linux-musl.tar.gz && \ + mv sccache-v0.10.0-x86_64-unknown-linux-musl/sccache /usr/local/bin/sccache && \ + cd /tmp && \ + rm -rf /tmp/sccache && \ + # sccache configurations + mkdir -p $HOME/.config/sccache && \ + mv /tmp/sccache.config.toml $HOME/.config/sccache/config.toml && \ + export SCCACHE_CONF=$HOME/.config/sccache/config.toml && \ + # build out AWS creds and configs + umask 077 && \ + AKI="$(cat /run/secrets/aws_access_key_id)" && \ + SAK="$(cat /run/secrets/aws_secret_access_key)" && \ + { \ + echo "[default]"; \ + echo "region=us-west-2"; \ + echo "aws_access_key_id=${AKI}"; \ + echo "aws_secret_access_key=${SAK}"; \ + } > /root/.aws/credentials && \ + { \ + echo "[default]"; \ + echo "region=us-west-2"; \ + echo "output=json"; \ + } > /root/.aws/config && \ + export AWS_SHARED_CREDENTIALS_FILE=/root/.aws/credentials AWS_CONFIG_FILE=/root/.aws/config AWS_DEFAULT_REGION="us-west-2" AWS_PROFILE=default && \ + export SCCACHE_IDLE_TIMEOUT=0 && \ + export SCCACHE_LOG=debug && \ + export RUSTC_WRAPPER="sccache" && \ + /usr/local/bin/sccache --start-server && \ + # verify sccache is working with the s3 backend + /usr/local/bin/sccache --show-stats; \ + fi + +WORKDIR /workspace + +# Create UV constraint files +RUN cat > /tmp/build-constraints.txt <<'EOF' +torch==2.8.0 +EOF + +RUN cat > /tmp/constraints.txt <<'EOF' +torch==2.8.0 +EOF + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + UV_LINK_MODE=copy \ + TORCH_CUDA_ARCH_LIST="9.0a;10.0+PTX" \ + # TORCH_CUDA_ARCH_LIST="10.0+PTX" \ + PYTHON_VERSION=${PYTHON_VERSION:-3.12} \ + UV_TORCH_BACKEND=${UV_TORCH_BACKEND:-cu${CUDA_MAJOR}${CUDA_MINOR}} \ + UV_BUILD_CONSTRAINT=/tmp/build-constraints.txt \ + UV_CONSTRAINT=/tmp/constraints.txt \ + VIRTUAL_ENV=/opt/vllm + +# Update base packages +RUN apt-get -q update -y && apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa -y && apt-get update && apt-get clean all + +# Install base packages and EPEL in single layer +RUN DOWNLOAD_ARCH=""; \ + if [ "$(uname -m)" = "amd64" ] || [ "$(uname -m)" = "x86_64" ]; then \ + DOWNLOAD_ARCH="x86_64"; \ + fi; \ + apt-get -q install -y \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-dev \ + python${PYTHON_VERSION}-venv \ + python3.9-dev \ + python3.9-venv \ + which procps findutils tar \ + gcc g++ \ + make cmake \ + autoconf automake libtool \ + git \ + curl wget \ + gzip \ + 
zlib1g-dev \ + libssl-dev \ + pkg-config \ + uuid-dev \ + libc6-dev \ + cmake \ + libibverbs1 \ + ibverbs-providers \ + ibverbs-utils \ + libibumad-dev \ + libibverbs-dev \ + libnuma-dev \ + librdmacm-dev \ + rdma-core \ + subunit \ + pciutils \ + libpci3 \ + ninja-build \ + xz-utils \ + rsync \ + && apt-get clean all + +# Setup Python virtual environment +RUN python${PYTHON_VERSION} -m venv /opt/vllm && \ + ${VIRTUAL_ENV}/bin/pip install --progress-bar off --no-cache -U pip wheel uv meson-python ninja pybind11 build + +ENV LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib/x86_64-linux-gnu" \ + CPATH="/usr/include:/usr/local/include:/usr/local/cuda/include" \ + PKG_CONFIG_PATH="/usr/lib/pkgconfig:/usr/local/lib/pkgconfig:/usr/local/lib/x86_64-linux-gnu/pkgconfig" + +ARG NVSHMEM_VERSION=3.3.20 + +# Set NVSHMEM paths for CMake discovery +ENV NVSHMEM_DIR="/opt/nvshmem-${NVSHMEM_VERSION}" \ + PATH="/opt/nvshmem-${NVSHMEM_VERSION}/bin:${PATH}" \ + CPATH="/opt/nvshmem-${NVSHMEM_VERSION}/include:${CPATH}" \ + LIBRARY_PATH="/opt/nvshmem-${NVSHMEM_VERSION}/lib:${LIBRARY_PATH}" + +# Build and install gdrcopy +RUN --mount=type=cache,target=/var/cache/git \ + git clone https://github.com/NVIDIA/gdrcopy.git && \ + cd gdrcopy && \ + PREFIX=/usr/local DESTLIB=/usr/local/lib make lib_install && \ + cp src/libgdrapi.so.2.* /usr/local/lib/ && \ + ldconfig && \ + cd .. && rm -rf gdrcopy + +# Build and install UCX to consume version with EFA fix +# as not present in 1.19 +ARG UCX_VERSION="7ec95b95e524a87e81cac92f5ca8523e3966b16b" +RUN --mount=type=cache,target=/var/cache/git \ + git clone https://github.com/openucx/ucx.git && \ + cd ucx && \ + git checkout ${UCX_VERSION} && \ + ./autogen.sh && \ + ./contrib/configure-release \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=/usr/local/cuda \ + --with-verbs \ + --with-dm \ + --with-gdrcopy=/usr/local \ + --enable-mt \ + --with-efa \ + --prefix=/usr/local && \ + make -j$(nproc) && \ + make install-strip && \ + ldconfig && \ + cd .. && rm -rf ucx + +ENV CPPFLAGS="-I$NVSHMEM_DIR/include ${CPPFLAGS}" \ + LDFLAGS="-L$NVSHMEM_DIR/lib ${LDFLAGS}" + +# Build and install NVSHMEM from source with coreweave patch +RUN cd /tmp && \ + wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz -O nvshmem_src_cuda${CUDA_MAJOR}.tar.gz && \ + tar -xf nvshmem_src_cuda${CUDA_MAJOR}.tar.gz && \ + cd nvshmem_src && \ + mkdir build && \ + cd build && \ + cmake \ + -G Ninja \ + -DNVSHMEM_PREFIX=${NVSHMEM_DIR} \ + -DCMAKE_CUDA_ARCHITECTURES="90a;100" \ + -DNVSHMEM_PMIX_SUPPORT=0 \ + -DNVSHMEM_LIBFABRIC_SUPPORT=1 \ + -DNVSHMEM_IBRC_SUPPORT=1 \ + -DNVSHMEM_IBGDA_SUPPORT=1 \ + -DNVSHMEM_IBDEVX_SUPPORT=1 \ + -DNVSHMEM_SHMEM_SUPPORT=0 \ + -DNVSHMEM_USE_GDRCOPY=1 \ + -DNVSHMEM_MPI_SUPPORT=0 \ + -DNVSHMEM_USE_NCCL=0 \ + -DNVSHMEM_BUILD_TESTS=0 \ + -DNVSHMEM_BUILD_EXAMPLES=0 \ + -DGDRCOPY_HOME=/usr/local \ + -DLIBFABRIC_HOME=/opt/amazon/efa \ + -DNVSHMEM_DISABLE_CUDA_VMM=1 \ + .. 
&& \ + ninja -j$(nproc) && \ + ninja install && \ + cd /tmp && rm -rf nvshmem_src* + +# Pin torch, so all deps are built against the same version +# as vllm itself +RUN --mount=type=cache,target=/root/.cache/uv \ + source ${VIRTUAL_ENV}/bin/activate && \ + uv pip install \ + # global + numpy torch \ + pyyaml \ + types-PyYAML \ + pytest \ + patchelf>=0.11.0 + +# build nixl from source to be able to use UCX from source +ARG NIXL_REPO_URL="https://github.com/ai-dynamo/nixl.git" +ARG NIXL_COMMIT_SHA="e20c39254a498cd7e7ea0ca945fd55fce52e85ea" +RUN --mount=type=cache,target=/tmp/nixl-cache \ + ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/lib64/ && \ + mkdir /opt/nixl && cd /opt/nixl && \ + git clone ${NIXL_REPO_URL} . && \ + git checkout ${NIXL_COMMIT_SHA} && \ + export PATH="${VIRTUAL_ENV}/bin:$PATH" && \ + export PYTHON="${VIRTUAL_ENV}/bin/python" && \ + export PKG_CONFIG_PATH="/usr/lib64/pkgconfig:/usr/share/pkgconfig:${PKG_CONFIG_PATH}" && \ + export CUDA_HOME="/usr/local/cuda" && \ + export LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs/:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/usr/local/lib/x86_64-linux-gnu:/usr/lib64:${LD_LIBRARY_PATH}" && \ + meson setup build --prefix=/usr/local -Dbuildtype=release && \ + cd build && \ + ninja && \ + ninja install && \ + cd .. && \ + # Build nixl wheel and install locally + source ${VIRTUAL_ENV}/bin/activate && \ + python -m build --no-isolation --wheel -o /wheels && \ + uv pip install --no-cache-dir . && \ + rm -rf build +RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/local.conf && \ + echo "/usr/local/lib/x86_64-linux-gnu" >> /etc/ld.so.conf.d/local.conf && \ + ldconfig + +WORKDIR /workspace + +# Define commit SHAs as build args to avoid layer invalidation +ARG LMCACHE_COMMIT_SHA="0db8ae4746a207a72483d716b5f82545a2ead24b" +ARG VLLM_COMMIT_SHA="f71952c1c49fb86686b0b300b727b26282362bf4" + +# Define if lmcache should be built +# Clone repositories with cache mounts +RUN --mount=type=cache,target=/var/cache/git \ + git clone https://github.com/neuralmagic/LMCache.git && \ + cd LMCache && \ + git checkout -q $LMCACHE_COMMIT_SHA && \ + cd .. && \ + # Build LMCache wheel + cd LMCache && \ + source ${VIRTUAL_ENV}/bin/activate && \ + uv build --wheel --no-build-isolation --out-dir /wheels && \ + cd .. + +# Use existing virtual environment at /opt/vllm +WORKDIR /workspace/vllm + +# Install DeepEP and DeepGEMM dependencies +ARG DEEPEP_URL="https://github.com/deepseek-ai/DeepEP" +ARG DEEPGEMM_URL="https://github.com/deepseek-ai/DeepGEMM" +ARG PPLX_KERNELS_URL="https://github.com/perplexityai/pplx-kernels" + +# Create wheels directory +RUN mkdir -p /wheels + +# Build compiled packages as wheels (only ones that need build tools) +RUN --mount=type=cache,target=/root/.cache/uv \ + source ${VIRTUAL_ENV}/bin/activate && \ + \ + # Install build tools + uv pip install build cuda-python numpy setuptools-scm ninja "nvshmem4py-cu${CUDA_MAJOR}" && \ + \ + # Build FlashInfer wheel + cd /tmp && \ + # Remove if already installed to prevent versioning issues + uv pip uninstall flashinfer-python || true && \ + git clone https://github.com/flashinfer-ai/flashinfer.git && \ + cd flashinfer && \ + uv pip install -e . --no-build-isolation && \ + uv build --wheel --no-build-isolation --out-dir /wheels && \ + cd .. && rm -rf flashinfer && \ + \ + # Build DeepEP wheel + git clone "${DEEPEP_URL}" deepep && \ + cd deepep && \ + uv build --wheel --no-build-isolation --out-dir /wheels && \ + cd .. 
&& rm -rf deepep && \ + \ + # Build DeepGEMM wheel + git clone "${DEEPGEMM_URL}" deepgemm && \ + cd deepgemm && \ + # git checkout multi_arch_support && \ + git submodule update --init --recursive && \ + uv build --wheel --no-build-isolation --out-dir /wheels && \ + cd .. && rm -rf deepgemm && \ + \ + # Build pplx-kernels wheel + git clone ${PPLX_KERNELS_URL} pplx-kernels && \ + cd pplx-kernels && \ + # git checkout build-fixes && \ + NVSHMEM_PREFIX=${NVSHMEM_DIR} uv build --wheel --out-dir /wheels && \ + cd .. && rm -rf pplx-kernels + +# verify builds are hitting the cache +# RUN sccache --show-stats + +# ============================================================================ +# RUNTIME STAGE - Minimal runtime image +# ============================================================================ +# FROM nvcr.io/nvidia/cuda:${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}-devel-ubi9 AS runtime +FROM nvcr.io/nvidia/cuda:${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}-devel-ubuntu24.04 AS runtime + +ARG CUDA_MAJOR=12 +ARG CUDA_MINOR=8 +ARG CUDA_PATCH=1 +ARG PYTHON_VERSION +ARG NVSHMEM_VERSION=3.3.20 + +RUN cat > /tmp/constraints.txt <<'EOF' +torch==2.8.0 +EOF + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + PYTHON_VERSION=${PYTHON_VERSION:-3.12} \ + UV_TORCH_BACKEND=${UV_TORCH_BACKEND:-cu${CUDA_MAJOR}${CUDA_MINOR}} \ + UV_CONSTRAINT=/tmp/constraints.txt \ + VIRTUAL_ENV=/opt/vllm \ + NVSHMEM_DIR="/opt/nvshmem-${NVSHMEM_VERSION}" \ + # LD_LIBRARY_PATH needs the torch path to apply proper linkers so as not to produce torch ABI missmatch + LD_LIBRARY_PATH="/usr/lib:/usr/local/lib/:/opt/vllm/lib/python${PYTHON_VERSION}/site-packages/torch/lib:/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib/x86_64-linux-gnu:/opt/nvshmem-${NVSHMEM_VERSION}/lib:${LD_LIBRARY_PATH}" \ + PATH="/opt/nvshmem-${NVSHMEM_VERSION}/bin:${PATH}" \ + CPATH="/opt/nvshmem-${NVSHMEM_VERSION}/include:${CPATH}" \ + TORCH_CUDA_ARCH_LIST="9.0a;10.0+PTX" + +# Update base packages +RUN apt-get -q update -y && apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa -y && apt-get update && apt-get clean all + +# Install only runtime dependencies +RUN apt-get update -y && apt-get install -y \ + python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + rdma-core \ + pciutils \ + procps \ + git \ + curl \ + libibverbs1 \ + ibverbs-providers \ + ibverbs-utils \ + libibumad3 \ + libnuma1 \ + librdmacm1 \ + gcc && apt-get clean all + +# Copy UCX libraries from builder +COPY --from=builder /usr/local/lib/libucp* /usr/local/lib/ +COPY --from=builder /usr/local/lib/libucs* /usr/local/lib/ +COPY --from=builder /usr/local/lib/libuct* /usr/local/lib/ +COPY --from=builder /usr/local/lib/libucm* /usr/local/lib/ +COPY --from=builder /usr/local/lib/ucx/ /usr/local/lib/ucx/ +COPY --from=builder /usr/local/bin/ucx* /usr/local/bin/ + +# Copy gdrcopy libraries from builder +COPY --from=builder /usr/local/lib/libgdrapi.so.2.* /usr/local/lib/ +COPY --from=builder /usr/local/lib/libgdrapi.so* /usr/local/lib/ + +RUN + +# Copy nixl libraries from builder +COPY --from=builder /usr/local/lib/x86_64-linux-gnu/libnixl* /usr/lib/ +COPY --from=builder /usr/local/lib/x86_64-linux-gnu/libstream.so /usr/lib/ +COPY --from=builder /usr/local/lib/x86_64-linux-gnu/libserdes.so /usr/lib/ +COPY --from=builder /usr/local/lib/x86_64-linux-gnu/plugins/ /usr/lib/plugins/ +COPY --from=builder /usr/local/include/nixl* 
/usr/local/include/ + +# Copy compiled NVSHMEM libraries from builder +COPY --from=builder /opt/nvshmem-${NVSHMEM_VERSION}/ /opt/nvshmem-${NVSHMEM_VERSION}/ + +# Setup ldconfig and library paths +RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/local.conf && \ + echo "/usr/lib" >> /etc/ld.so.conf.d/local.conf && \ + echo "/usr/lib/x86_64-linux-gnu/libibverbs" >> /etc/ld.so.conf.d/local.conf && \ + echo "/opt/nvshmem-${NVSHMEM_VERSION}/lib" >> /etc/ld.so.conf.d/local.conf && \ + echo "/opt/vllm/lib64/python3.12/site-packages/.nixl.mesonpy.libs/plugins" >> /etc/ld.so.conf.d/local.conf && \ + ldconfig + +# Setup Python virtual environment +RUN python${PYTHON_VERSION} -m venv /opt/vllm && \ + ${VIRTUAL_ENV}/bin/pip install --no-cache -U pip wheel uv + +# Copy compiled wheels +COPY --from=builder /wheels/*.whl /tmp/wheels/ + +# Define commit SHAs as build args to avoid layer invalidation +ARG LMCACHE_COMMIT_SHA="0db8ae4746a207a72483d716b5f82545a2ead24b" +ARG VLLM_COMMIT_SHA="f71952c1c49fb86686b0b300b727b26282362bf4" + +ARG VLLM_PREBUILT=0 + +# Public LLM-D vllm wheels index +ARG VLLM_WHEEL_URL="https://gitlab.com/api/v4/projects/72482892/packages/pypi/simple" + +# Install PyTorch and cuda-python +# Install all compiled wheels (DeepEP, DeepGEMM, pplx-kernels, LMCache, nixl) +# Installs vllm source editably (unless using custom prebuilt) for dev experience + +# Install PyTorch and cuda-python +# Install all compiled wheels (DeepEP, DeepGEMM, pplx-kernels, LMCache, nixl) +# Installs vllm source editably (unless using custom prebuilt) for dev experience +RUN --mount=type=cache,target=/var/cache/git \ + . /opt/vllm/bin/activate && \ + uv pip install "nvshmem4py-cu${CUDA_MAJOR}" cuda-python 'huggingface_hub[hf_xet]' && \ + uv pip install /tmp/wheels/*.whl && \ + git clone https://github.com/vllm-project/vllm.git /opt/vllm-source && \ + cd /opt/vllm-source && \ + git checkout -q ${VLLM_COMMIT_SHA}; \ + VLLM_WHEEL_ARCH=""; \ + if [ "$(uname -m)" = "amd64" ] || [ "$(uname -m)" = "x86_64" ]; then \ + VLLM_WHEEL_ARCH="x86_64"; \ + fi; \ + if [ "${VLLM_PREBUILT}" = "1" ] && [ -n "${VLLM_WHEEL_URL}" ]; then \ + VLLM_COMMIT_SHA_SHORT="g$(git rev-parse --short HEAD)"; \ + CUDA_SHORT="cu${CUDA_MAJOR}${CUDA_MINOR}"; \ + export VLLM_WHEEL_VERSION="0.0.0+${VLLM_COMMIT_SHA_SHORT}.${CUDA_SHORT}"; \ + VLLM_USE_PRECOMPILED=1 uv pip install --index-url "${VLLM_WHEEL_URL}" "vllm==${VLLM_WHEEL_VERSION}"; \ + else \ + VLLM_COMMIT="$(git merge-base HEAD origin/main)"; \ + VLLM_PRECOMPILED_WHEEL_LOCATION="https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_${VLLM_WHEEL_ARCH}.whl"; \ + VLLM_USE_PRECOMPILED=1 uv pip install --editable .; \ + fi; \ + uv pip install "nvidia-nccl-cu12>=2.26.2.post1" && \ + rm -rf /tmp/wheels + +RUN apt-get autoremove -y && apt-get clean all + +# setup non-root user for OpenShift +RUN umask 002 && \ + useradd --uid 2000 --gid 0 vllm && \ + rm -rf /home/vllm && \ + mkdir -p /home/vllm && \ + chown vllm:root /home/vllm && \ + chmod g+rwx /home/vllm + +# default openionated env vars for HF_HOME and TRANSFORMERS_CACHE, over-writeable +ENV LLM_D_MODELS_DIR=/var/lib/llm-d/models \ + HF_HOME=/var/lib/llm-d/.hf \ + TRANSFORMERS_CACHE=/var/lib/llm-d/.cache/huggingface + +# creates default models directory and makes path writeable for both root and default user, with symlink for convenience +# find command keeps group=0 on all new subdirs created later +RUN mkdir -p "$LLM_D_MODELS_DIR" "$HF_HOME" "$TRANSFORMERS_CACHE" && \ + chown -R root:0 /var/lib/llm-d && \ + chmod -R g+rwX 
/var/lib/llm-d && \ + find /var/lib/llm-d -type d -exec chmod g+s {} \; && \ + ln -snf /var/lib/llm-d/models /models + +ENV PATH="${VIRTUAL_ENV}/bin:/usr/local/nvidia/bin:${PATH}" \ + HOME=/home/vllm \ + VLLM_USAGE_SOURCE=production-docker-image \ + VLLM_WORKER_MULTIPROC_METHOD=fork \ + OUTLINES_CACHE_DIR=/tmp/outlines \ + NUMBA_CACHE_DIR=/tmp/numba \ + TRITON_CACHE_DIR=/tmp/triton \ + TRITON_LIBCUDA_PATH=/usr/lib64 \ + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=15 \ + TORCH_NCCL_DUMP_ON_TIMEOUT=0 \ + VLLM_SKIP_P2P_CHECK=1 \ + VLLM_CACHE_ROOT=/tmp/vllm \ + # NOTE: workaround for hang with XET. This should be removed + # in a future release when we get to the bottom of the issue. + HF_HUB_DISABLE_XET=1 + +USER 2000 +WORKDIR /home/vllm + +ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/k8s/docker/Dockerfile.cuda b/k8s/docker/Dockerfile.cuda new file mode 100644 index 000000000..3bc72e0ad --- /dev/null +++ b/k8s/docker/Dockerfile.cuda @@ -0,0 +1,486 @@ +ARG CUDA_MAJOR=12 +ARG CUDA_MINOR=9 +ARG CUDA_PATCH=1 + +# ============================================================================ +# BUILD STAGE - Install build dependencies and create wheels +# ============================================================================ +FROM nvcr.io/nvidia/cuda:${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}-devel-ubi9 AS builder + +ARG CUDA_MAJOR=12 +ARG CUDA_MINOR=9 +ARG CUDA_PATCH=1 +ARG PYTHON_VERSION + +ARG USE_SCCACHE=true + +RUN cat > /usr/local/bin/setup-sccache <<'EOF' +#!/bin/bash +if [ "${USE_SCCACHE}" = "true" ]; then + # Set up AWS credentials if secrets are available + if [ -f "/run/secrets/aws_access_key_id" ] && [ -f "/run/secrets/aws_secret_access_key" ]; then + export AWS_ACCESS_KEY_ID="$(cat /run/secrets/aws_access_key_id)" + export AWS_SECRET_ACCESS_KEY="$(cat /run/secrets/aws_secret_access_key)" + export AWS_DEFAULT_REGION="us-west-2" + fi + + export CMAKE_C_COMPILER_LAUNCHER=sccache + export CMAKE_CXX_COMPILER_LAUNCHER=sccache + export CMAKE_CUDA_COMPILER_LAUNCHER=sccache + + # Configure sccache via environment variables + export SCCACHE_BUCKET="vllm-nightly-sccache" + export SCCACHE_REGION="us-west-2" + export SCCACHE_S3_KEY_PREFIX="llm-d-cache/" + export SCCACHE_IDLE_TIMEOUT=0 + + if ! /usr/local/bin/sccache --start-server; then + echo "Warning: sccache failed to start, continuing without cache" >&2 + unset CMAKE_C_COMPILER_LAUNCHER CMAKE_CXX_COMPILER_LAUNCHER CMAKE_CUDA_COMPILER_LAUNCHER + return 1 + fi + + if ! 
/usr/local/bin/sccache --show-stats >/dev/null 2>&1; then + echo "Warning: sccache not responding properly, disabling cache" >&2 + /usr/local/bin/sccache --stop-server 2>/dev/null || true + unset CMAKE_C_COMPILER_LAUNCHER CMAKE_CXX_COMPILER_LAUNCHER CMAKE_CUDA_COMPILER_LAUNCHER + return 1 + fi + + echo "sccache successfully configured with cache prefix: ${SCCACHE_S3_KEY_PREFIX}" +fi +EOF +RUN chmod +x /usr/local/bin/setup-sccache + +RUN --mount=type=secret,id=aws_access_key_id \ + --mount=type=secret,id=aws_secret_access_key \ + if [ "${USE_SCCACHE}" = "true" ]; then \ + set -Eeuo pipefail && \ + dnf install -y openssl-devel && \ + mkdir -p /tmp/sccache && \ + cd /tmp/sccache && \ + curl -sLO https://github.com/mozilla/sccache/releases/download/v0.10.0/sccache-v0.10.0-x86_64-unknown-linux-musl.tar.gz && \ + tar -xf sccache-v0.10.0-x86_64-unknown-linux-musl.tar.gz && \ + mv sccache-v0.10.0-x86_64-unknown-linux-musl/sccache /usr/local/bin/sccache && \ + cd /tmp && \ + rm -rf /tmp/sccache && \ + source /usr/local/bin/setup-sccache && \ + echo "int main() { return 0; }" | sccache gcc -x c - -o /dev/null && \ + echo "sccache installation and S3 connectivity verified"; \ + fi + +WORKDIR /workspace + +# Create UV constraint files +RUN cat > /tmp/build-constraints.txt <<'EOF' +torch==2.8.0 +EOF + +RUN cat > /tmp/constraints.txt <<'EOF' +torch==2.8.0 +EOF + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + UV_LINK_MODE=copy \ + TORCH_CUDA_ARCH_LIST="9.0a;10.0+PTX" \ + PYTHON_VERSION=${PYTHON_VERSION:-3.12} \ + UV_TORCH_BACKEND=${UV_TORCH_BACKEND:-cu${CUDA_MAJOR}${CUDA_MINOR}} \ + UV_BUILD_CONSTRAINT=/tmp/build-constraints.txt \ + UV_CONSTRAINT=/tmp/constraints.txt \ + VIRTUAL_ENV=/opt/vllm + +# Update base packages +RUN dnf -q update -y && dnf clean all + +# Install base packages and EPEL in single layer +RUN dnf -q install -y dnf-plugins-core && \ + dnf -q install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ + dnf config-manager --set-enabled epel && \ + DOWNLOAD_ARCH=""; \ + if [ "$(uname -m)" = "amd64" ] || [ "$(uname -m)" = "x86_64" ]; then \ + DOWNLOAD_ARCH="x86_64"; \ + fi; \ + dnf config-manager --add-repo "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/${DOWNLOAD_ARCH}/cuda-rhel9.repo" && \ + dnf -q install -y --allowerasing \ + python${PYTHON_VERSION} python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \ + python${PYTHON_VERSION}-devel \ + python3.9-devel \ + which procps findutils tar \ + gcc gcc-c++ \ + make cmake \ + autoconf automake libtool \ + git \ + curl wget \ + gzip \ + zlib-devel \ + openssl-devel \ + pkg-config \ + libuuid-devel \ + glibc-devel \ + rdma-core-devel \ + libibverbs \ + libibverbs-devel \ + numactl-libs \ + subunit \ + pciutils \ + pciutils-libs \ + ninja-build \ + xz \ + rsync \ + && dnf clean all + +# Setup Python virtual environment +RUN python${PYTHON_VERSION} -m venv /opt/vllm && \ + ${VIRTUAL_ENV}/bin/pip install --progress-bar off --no-cache -U pip wheel uv meson-python ninja pybind11 build + +ENV LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64" \ + CPATH="/usr/include:/usr/local/include:/usr/local/cuda/include" \ + PKG_CONFIG_PATH="/usr/lib/pkgconfig:/usr/local/lib/pkgconfig:/usr/local/lib64/pkgconfig" + +ARG NVSHMEM_VERSION=3.3.20 + +# Set NVSHMEM paths for CMake discovery +ENV NVSHMEM_DIR="/opt/nvshmem-${NVSHMEM_VERSION}" \ + PATH="/opt/nvshmem-${NVSHMEM_VERSION}/bin:${PATH}" \ + CPATH="/opt/nvshmem-${NVSHMEM_VERSION}/include:${CPATH}" \ + 
LIBRARY_PATH="/opt/nvshmem-${NVSHMEM_VERSION}/lib:${LIBRARY_PATH}" + +# Build and install gdrcopy +# TODO: CUSTOM LOGGING +RUN --mount=type=cache,target=/var/cache/git \ + --mount=type=secret,id=aws_access_key_id \ + --mount=type=secret,id=aws_secret_access_key \ + source /usr/local/bin/setup-sccache && \ + git clone https://github.com/NVIDIA/gdrcopy.git && \ + cd gdrcopy && \ + CC="sccache gcc" CXX="sccache g++" PREFIX=/usr/local DESTLIB=/usr/local/lib make lib_install && \ + cp src/libgdrapi.so.2.* /usr/lib64/ && \ + ldconfig && \ + cd .. && rm -rf gdrcopy && \ + if [ "${USE_SCCACHE}" = "true" ]; then \ + echo "=== gdrcopy build complete - sccache stats ===" && \ + sccache --show-stats; \ + fi + +ENV CPPFLAGS="-I$NVSHMEM_DIR/include ${CPPFLAGS}" \ + LDFLAGS="-L$NVSHMEM_DIR/lib ${LDFLAGS}" + +# Create wheels directory +RUN mkdir -p /wheels + +# Copy patches before build +COPY patches/ /tmp/patches/ + +# Build and install NVSHMEM from source with coreweave patch +RUN --mount=type=secret,id=aws_access_key_id \ + --mount=type=secret,id=aws_secret_access_key \ + cd /tmp && \ + source /usr/local/bin/setup-sccache && \ + wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz -O nvshmem_src_cuda${CUDA_MAJOR}.tar.gz && \ + tar -xf nvshmem_src_cuda${CUDA_MAJOR}.tar.gz && \ + cd nvshmem_src && \ + git apply /tmp/patches/cks_nvshmem${NVSHMEM_VERSION}.patch && \ + mkdir build && \ + cd build && \ + cmake \ + -G Ninja \ + -DNVSHMEM_PREFIX=${NVSHMEM_DIR} \ + -DCMAKE_CUDA_ARCHITECTURES="90a;100" \ + -DNVSHMEM_PMIX_SUPPORT=0 \ + -DNVSHMEM_LIBFABRIC_SUPPORT=0 \ + -DNVSHMEM_IBRC_SUPPORT=1 \ + -DNVSHMEM_IBGDA_SUPPORT=1 \ + -DNVSHMEM_IBDEVX_SUPPORT=1 \ + -DNVSHMEM_SHMEM_SUPPORT=0 \ + -DNVSHMEM_USE_GDRCOPY=1 \ + -DNVSHMEM_MPI_SUPPORT=0 \ + -DNVSHMEM_USE_NCCL=0 \ + -DNVSHMEM_BUILD_TESTS=0 \ + -DNVSHMEM_BUILD_EXAMPLES=0 \ + -DGDRCOPY_HOME=/usr/local \ + -DNVSHMEM_DISABLE_CUDA_VMM=1 \ + .. && \ + ninja -j$(nproc) && \ + ninja install && \ + cp ${NVSHMEM_DIR}/lib/python/dist/nvshmem4py_cu${CUDA_MAJOR}-*-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}-manylinux_*.whl /wheels/ && \ + cd /tmp && rm -rf nvshmem_src* && \ + if [ "${USE_SCCACHE}" = "true" ]; then \ + echo "=== NVSHMEM build complete - sccache stats ===" && \ + sccache --show-stats; \ + fi + +# Pin torch, so all deps are built against the same version +# as vllm itself +RUN --mount=type=cache,target=/root/.cache/uv \ + source ${VIRTUAL_ENV}/bin/activate && \ + uv pip install \ + # global + numpy torch \ + pyyaml \ + types-PyYAML \ + pytest \ + patchelf>=0.11.0 + +RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/local.conf && \ + echo "/usr/local/lib64" >> /etc/ld.so.conf.d/local.conf && \ + ldconfig + +WORKDIR /workspace + +# Define commit SHAs as build args to avoid layer invalidation +ARG LMCACHE_REPO=https://github.com/neuralmagic/LMCache.git +ARG LMCACHE_COMMIT_SHA="0db8ae4746a207a72483d716b5f82545a2ead24b" +ARG VLLM_COMMIT_SHA="f71952c1c49fb86686b0b300b727b26282362bf4" + +# Define if lmcache should be built +# Clone repositories with cache mounts +RUN --mount=type=cache,target=/var/cache/git \ + git clone "${LMCACHE_REPO}" LMCache && \ + cd LMCache && \ + git checkout -q "${LMCACHE_COMMIT_SHA}" && \ + cd .. && \ + # Build LMCache wheel + cd LMCache && \ + source ${VIRTUAL_ENV}/bin/activate && \ + uv build --wheel --no-build-isolation --out-dir /wheels && \ + cd .. 
+ +# Use existing virtual environment at /opt/vllm +WORKDIR /workspace/vllm + +# set kernel library dependencies +# note: these libraries don't yet push sdist releases to pypi +# so down below we do a git clone +ARG DEEPEP_REPO="https://github.com/deepseek-ai/DeepEP" +ARG DEEPEP_VERSION="v1.2.1" +ARG DEEPGEMM_REPO="https://github.com/deepseek-ai/DeepGEMM" +ARG DEEPGEMM_VERSION="v2.1.0" +ARG PPLX_KERNELS_REPO="https://github.com/perplexityai/pplx-kernels" +ARG PPLX_KERNELS_SHA="12cecfda252e4e646417ac263d96e994d476ee5d" + +ARG FLASHINFER_VERSION="v0.3.1" + +# Build compiled packages as wheels (only ones that need build tools) +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=secret,id=aws_access_key_id \ + --mount=type=secret,id=aws_secret_access_key \ + source ${VIRTUAL_ENV}/bin/activate && \ + source /usr/local/bin/setup-sccache && \ + \ + # Install build tools + uv pip install build cuda-python numpy setuptools-scm ninja && \ + uv pip install /wheels/nvshmem4py_cu${CUDA_MAJOR}-*.whl && \ + \ + # Build FlashInfer wheel + cd /tmp && \ + # Remove if already installed to prevent versioning issues + uv pip uninstall flashinfer-python || true && \ + git clone https://github.com/flashinfer-ai/flashinfer.git && \ + cd flashinfer && \ + git checkout -q "${FLASHINFER_VERSION}" && \ + uv build --wheel --no-build-isolation --out-dir /wheels && \ + cd .. && rm -rf flashinfer && \ + \ + # Build DeepEP wheel + git clone "${DEEPEP_REPO}" deepep && \ + cd deepep && \ + git checkout -q "${DEEPEP_VERSION}" && \ + uv build --wheel --no-build-isolation --out-dir /wheels && \ + cd .. && rm -rf deepep && \ + \ + # Build DeepGEMM wheel + git clone "${DEEPGEMM_REPO}" deepgemm && \ + cd deepgemm && \ + git checkout -q "${DEEPGEMM_VERSION}" && \ + git submodule update --init --recursive && \ + uv build --wheel --no-build-isolation --out-dir /wheels && \ + cd .. && rm -rf deepgemm && \ + \ + # Build pplx-kernels wheel + git clone ${PPLX_KERNELS_REPO} pplx-kernels && \ + cd pplx-kernels && \ + git checkout ${PPLX_KERNELS_SHA} && \ + NVSHMEM_PREFIX=${NVSHMEM_DIR} uv build --wheel --out-dir /wheels && \ + cd .. 
&& rm -rf pplx-kernels && \ + if [ "${USE_SCCACHE}" = "true" ]; then \ + echo "=== Compiled wheels build complete - sccache stats ===" && \ + sccache --show-stats; \ + fi + +# ============================================================================ +# RUNTIME STAGE - Minimal runtime image +# ============================================================================ +FROM nvcr.io/nvidia/cuda:${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}-devel-ubi9 AS runtime + +ARG CUDA_MAJOR=12 +ARG CUDA_MINOR=9 +ARG CUDA_PATCH=1 +ARG PYTHON_VERSION +ARG NVSHMEM_VERSION=3.3.20 + +RUN cat > /tmp/constraints.txt <<'EOF' +torch==2.8.0 +EOF + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + PYTHON_VERSION=${PYTHON_VERSION:-3.12} \ + UV_TORCH_BACKEND=${UV_TORCH_BACKEND:-cu${CUDA_MAJOR}${CUDA_MINOR}} \ + UV_CONSTRAINT=/tmp/constraints.txt \ + VIRTUAL_ENV=/opt/vllm \ + NVSHMEM_DIR="/opt/nvshmem-${NVSHMEM_VERSION}" \ + # LD_LIBRARY_PATH needs the torch path to apply proper linkers so as not to produce torch ABI missmatch + LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:/opt/nvshmem-${NVSHMEM_VERSION}/lib:${LD_LIBRARY_PATH}" \ + PATH="/opt/nvshmem-${NVSHMEM_VERSION}/bin:${PATH}" \ + CPATH="/opt/nvshmem-${NVSHMEM_VERSION}/include:${CPATH}" \ + TORCH_CUDA_ARCH_LIST="9.0a;10.0+PTX" + +# Update base packages +RUN dnf update -y && dnf clean all + +# Install only runtime dependencies +RUN dnf install -y --allowerasing \ + python${PYTHON_VERSION} python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-devel \ + rdma-core-devel \ + numactl-libs \ + pciutils \ + procps-ng \ + git \ + curl \ + jq \ + gcc && dnf clean all + +# Copy gdrcopy libraries from builder +COPY --from=builder /usr/lib64/libgdrapi.so.2.* /usr/lib64/ +COPY --from=builder /usr/local/lib/libgdrapi.so* /usr/local/lib/ + +# Copy compiled NVSHMEM libraries from builder +COPY --from=builder /opt/nvshmem-${NVSHMEM_VERSION}/ /opt/nvshmem-${NVSHMEM_VERSION}/ + +# Setup ldconfig and library paths +RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/local.conf && \ + echo "/usr/local/lib64" >> /etc/ld.so.conf.d/local.conf && \ + echo "/opt/nvshmem-${NVSHMEM_VERSION}/lib" >> /etc/ld.so.conf.d/local.conf && \ + ldconfig + +# Setup Python virtual environment +RUN python${PYTHON_VERSION} -m venv /opt/vllm && \ + ${VIRTUAL_ENV}/bin/pip install --no-cache -U pip wheel uv + +# Copy compiled wheels +COPY --from=builder /wheels/*.whl /tmp/wheels/ + +# Create the vllm user +RUN useradd --uid 2000 --gid 0 vllm && \ + touch /home/vllm/.bashrc + +# Create the vllm workspace with permissions to swap commits and remotes +RUN mkdir -p /opt/vllm-source && \ + chown -R 2000:0 /opt/vllm-source && \ + chmod -R g+rwX /opt/vllm-source && \ + find /opt/vllm-source -type d -exec chmod g+s {} \; && \ + setfacl -R -m g:0:rwX -m d:g:0:rwX /opt/vllm-source || true + +# Define commit SHAs as build args to avoid layer invalidation +ARG LMCACHE_COMMIT_SHA="0db8ae4746a207a72483d716b5f82545a2ead24b" +ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" +ARG VLLM_COMMIT_SHA="f71952c1c49fb86686b0b300b727b26282362bf4" + +# Dictates if we should pull a production wheel for the vllm commit sha +ARG VLLM_PREBUILT=0 +# Dictates if we should pull precompiled binaries when installing vllm editably. These commits must be on main in vLLM. 
+ARG VLLM_USE_PRECOMPILED=1
+
+ADD scripts/warn-vllm-precompiled.sh /opt/
+
+# Install cuda-python, nvshmem python bindings, xet
+# Install all compiled wheels (DeepEP, DeepGEMM, pplx-kernels, LMCache, nixl)
+# Installs vllm source. Supports three install modes:
+  # 1) install vllm from source totally editably - dev option, supports fork and commit swapping
+  # 2) install vllm from source while pulling precompiled binaries (shared libraries) from the vllm wheels index - less flexible dev option, may or may not work when swapping SHAs but faster than a full editable build
+  # 3) install vllm from a wheel on the vllm wheels index - prod option, no flexibility
+RUN --mount=type=cache,target=/var/cache/git \
+    source /opt/vllm/bin/activate && \
+    uv pip install nixl cuda-python 'huggingface_hub[hf_xet]' && \
+    uv pip install /tmp/wheels/*.whl && \
+    git clone "${VLLM_REPO}" /opt/vllm-source && \
+    git -C /opt/vllm-source config --system --add safe.directory /opt/vllm-source && \
+    git -C /opt/vllm-source fetch --depth=1 origin "${VLLM_COMMIT_SHA}" || true && \
+    git -C /opt/vllm-source checkout -q "${VLLM_COMMIT_SHA}" && \
+    export WHEEL_URL=$(pip install \
+        --no-cache-dir \
+        --no-index \
+        --no-deps \
+        --find-links "https://wheels.vllm.ai/${VLLM_COMMIT_SHA}/vllm/" \
+        --only-binary=:all: \
+        --pre vllm \
+        --dry-run \
+        --disable-pip-version-check \
+        -qqq \
+        --report - \
+        2>/dev/null | jq -r '.install[0].download_info.url'); \
+    if [ "${VLLM_PREBUILT}" = "1" ]; then \
+        if [ -z "${WHEEL_URL}" ]; then \
+            echo "VLLM_PREBUILT set but no platform compatible wheel exists for: https://wheels.vllm.ai/${VLLM_COMMIT_SHA}/vllm/"; \
+            exit 1; \
+        fi; \
+        uv pip install "${WHEEL_URL}"; \
+        rm /opt/warn-vllm-precompiled.sh; \
+    else \
+        if [ "${VLLM_USE_PRECOMPILED}" = "1" ] && [ -n "${WHEEL_URL}" ]; then \
+            echo "Using precompiled binaries and shared libraries for commit: ${VLLM_COMMIT_SHA}."; \
+            export VLLM_USE_PRECOMPILED=1; \
+            export VLLM_PRECOMPILED_WHEEL_LOCATION="${WHEEL_URL}"; \
+            uv pip install -e /opt/vllm-source; \
+            /opt/warn-vllm-precompiled.sh; \
+            rm /opt/warn-vllm-precompiled.sh; \
+        else \
+            echo "Compiling fully from source. 
Either precompile disabled or wheel not found in index from main."; \ + unset VLLM_USE_PRECOMPILED VLLM_PRECOMPILED_WHEEL_LOCATION || true; \ + uv pip install -e /opt/vllm-source; \ + rm /opt/warn-vllm-precompiled.sh; \ + fi; \ + fi; \ + uv pip install "nvidia-nccl-cu12>=2.26.2.post1" && \ + rm -rf /tmp/wheels + +RUN dnf autoremove -y && dnf clean all + +# setup non-root user for OpenShift +RUN umask 002 && \ + rm -rf /home/vllm && \ + mkdir -p /home/vllm && \ + chown vllm:root /home/vllm && \ + chmod g+rwx /home/vllm + +# default openionated env var for HF_HOME, over-writeable +ENV LLM_D_MODELS_DIR=/var/lib/llm-d/models \ + HF_HOME=/var/lib/llm-d/.hf + +# creates default models directory and makes path writeable for both root and default user, with symlink for convenience +# find command keeps group=0 on all new subdirs created later +RUN mkdir -p "$LLM_D_MODELS_DIR" "$HF_HOME" && \ + chown -R root:0 /var/lib/llm-d && \ + chmod -R g+rwX /var/lib/llm-d && \ + find /var/lib/llm-d -type d -exec chmod g+s {} \; && \ + ln -snf /var/lib/llm-d/models /models + +ENV PATH="${VIRTUAL_ENV}/bin:/usr/local/nvidia/bin:${PATH}" \ + HOME=/home/vllm \ + VLLM_USAGE_SOURCE=production-docker-image \ + VLLM_WORKER_MULTIPROC_METHOD=fork \ + OUTLINES_CACHE_DIR=/tmp/outlines \ + NUMBA_CACHE_DIR=/tmp/numba \ + TRITON_CACHE_DIR=/tmp/triton \ + TRITON_LIBCUDA_PATH=/usr/lib64 \ + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=15 \ + TORCH_NCCL_DUMP_ON_TIMEOUT=0 \ + VLLM_SKIP_P2P_CHECK=1 \ + VLLM_CACHE_ROOT=/tmp/vllm \ + UCX_MEM_MMAP_HOOK_MODE=none + +USER 2000 +WORKDIR /home/vllm + +ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/k8s/docker/Dockerfile.gke b/k8s/docker/Dockerfile.gke new file mode 100644 index 000000000..08b116eb2 --- /dev/null +++ b/k8s/docker/Dockerfile.gke @@ -0,0 +1,197 @@ +# Dockerfile for llm-d on GKE +# This image works around an issue with UBI RDMA drivers and NVSHMEM +# which has not yet been resolved. + +# Use a CUDA base image. +FROM docker.io/nvidia/cuda:12.9.1-devel-ubuntu22.04 AS base + +WORKDIR /app + +ENV CUDA_MAJOR=12 +ENV CUDA_MINOR=9 +ENV PYTHON_VERSION=3.12 +ENV UCX_VERSION=1.19.0 +ENV UCX_HOME=/opt/ucx +ENV CUDA_HOME=/usr/local/cuda/ +ENV GDRCOPY_VERSION=2.5.1 +ENV GDRCOPY_HOME=/usr/local +ENV NVSHMEM_VERSION=3.3.20 +ENV NVSHMEM_PREFIX=/usr/local/nvshmem +ENV TORCH_CUDA_ARCH_LIST="9.0a 10.0" +ENV CMAKE_CUDA_ARCHITECTURES="90a;100" +# Work around https://github.com/vllm-project/vllm/issues/18859 and mount gIB if they +# are found for NCCL. 
+ENV LD_LIBRARY_PATH=/usr/local/gib/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}: +# For neovim.appimage +ENV APPIMAGE_EXTRACT_AND_RUN=1 +ENV DEBIAN_FRONTEND=noninteractive + +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select New_York' | debconf-set-selections \ + && apt-get -qq update \ + && apt-get -qq install -y ccache software-properties-common git wget curl \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ + # Mellanox OFED + && wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add - \ + && cd /etc/apt/sources.list.d/ && wget https://linux.mellanox.com/public/repo/mlnx_ofed/24.10-0.7.0.0/ubuntu22.04/mellanox_mlnx_ofed.list \ + # Update all + && apt-get -qq update \ + && apt-get -qq install -y --no-install-recommends \ + # Python and related tools + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-dev \ + python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-dbg \ + ca-certificates \ + htop \ + iputils-ping net-tools dnsutils \ + vim ripgrep bat clangd fuse fzf \ + nodejs npm clang fd-find xclip \ + zsh \ + # Build tools for UCX, NVSHMEM, etc. + build-essential \ + autoconf automake libtool pkg-config \ + ninja-build cmake \ + # Other dependencies + libnuma1 libsubunit0 libpci-dev \ + # NVSHMEM dependency + datacenter-gpu-manager \ + # Allows NVSHMEM to build nvshmem4py + python3.10-venv python3.10-dev \ + # Debugging tools + kmod pciutils binutils \ + gdb strace lsof \ + # GCP leverages these libraries for NCCL initialization + libnl-3-200 libnl-route-3-200 \ + # Mellanox OFED + ibverbs-utils libibumad3 \ + # Debugging tools for RDMA + rdmacm-utils ibverbs-utils libibumad-dev librdmacm-dev infiniband-diags libibverbs-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && python${PYTHON_VERSION} -m ensurepip --upgrade \ + && python${PYTHON_VERSION} -m pip install --upgrade pip setuptools wheel + +# --- Build and Install GDRCopy from Source --- +RUN cd /tmp && \ + git clone https://github.com/NVIDIA/gdrcopy.git && \ + cd gdrcopy && \ + git checkout tags/v${GDRCOPY_VERSION} && \ + make prefix=${GDRCOPY_HOME} lib_install exes_install && \ + ldconfig && \ + rm -rf /tmp/gdrcopy + +ENV PATH=${GDRCOPY_HOME}/bin:${PATH} +ENV LD_LIBRARY_PATH=${GDRCOPY_HOME}/lib:${LD_LIBRARY_PATH} +ENV CPATH=${GDRCOPY_HOME}/include:${CPATH} +ENV LIBRARY_PATH=${GDRCOPY_HOME}/lib:${LIBRARY_PATH} + +# --- Build and Install UCX from Source --- +RUN cd /tmp \ + && wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz \ + && tar -zxf ucx-${UCX_VERSION}.tar.gz \ + && cd ucx-${UCX_VERSION} \ + && ./contrib/configure-release \ + --prefix=${UCX_HOME} \ + --with-cuda=${CUDA_HOME} \ + --with-gdrcopy=${GDRCOPY_HOME} \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-verbs \ + --with-dm \ + --enable-mt \ + && make -j$(nproc) && make install-strip \ + && rm -rf /tmp/ucx-${UCX_VERSION}* + +ENV PATH=${UCX_HOME}/bin:${PATH} +ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH} +ENV CPATH=${UCX_HOME}/include:${CPATH} +ENV LIBRARY_PATH=${UCX_HOME}/lib:${LIBRARY_PATH} +ENV PKG_CONFIG_PATH=${UCX_HOME}/lib/pkgconfig:${PKG_CONFIG_PATH} + +# --- Build and Install NVSHMEM 
from Source --- +ENV MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi +ENV CPATH=${MPI_HOME}/include:${CPATH} +RUN cd /tmp \ + && wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda${CUDA_MAJOR}-all-all-${NVSHMEM_VERSION}.tar.gz \ + && tar -xzf nvshmem_src_cuda${CUDA_MAJOR}-all-all-${NVSHMEM_VERSION}.tar.gz \ + && cd nvshmem_src \ + && mkdir -p build \ + && cd build \ + && cmake \ + -G Ninja \ + -DNVSHMEM_PREFIX=${NVSHMEM_PREFIX} \ + -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} \ + -DNVSHMEM_PMIX_SUPPORT=0 \ + -DNVSHMEM_LIBFABRIC_SUPPORT=0 \ + -DNVSHMEM_IBRC_SUPPORT=1 \ + -DNVSHMEM_IBGDA_SUPPORT=1 \ + -DNVSHMEM_IBDEVX_SUPPORT=1 \ + -DNVSHMEM_SHMEM_SUPPORT=0 \ + -DNVSHMEM_USE_GDRCOPY=1 \ + -DNVSHMEM_USE_NCCL=0 \ + -DNVSHMEM_BUILD_TESTS=0 \ + -DNVSHMEM_BUILD_EXAMPLES=0 \ + -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + -DLIBFABRIC_HOME=/usr \ + -DGDRCOPY_HOME=${GDRCOPY_HOME} \ + -DNVSHMEM_MPI_SUPPORT=0 \ + -DNVSHMEM_DISABLE_CUDA_VMM=1 \ + .. \ + && ninja -j$(nproc) \ + && ninja -j$(nproc) install \ + && rm -rf /tmp/nvshmem_src* + +ENV PATH=${NVSHMEM_PREFIX}/bin:${PATH} +ENV LD_LIBRARY_PATH=${NVSHMEM_PREFIX}/lib:${LD_LIBRARY_PATH} +ENV CPATH=${NVSHMEM_PREFIX}/include:${CPATH} +ENV LIBRARY_PATH=${NVSHMEM_PREFIX}/lib:${LIBRARY_PATH} +ENV PKG_CONFIG_PATH=${NVSHMEM_PREFIX}/lib/pkgconfig:${PKG_CONFIG_PATH} + +# Install UV, dependencies and NIXL (python) +SHELL ["/bin/bash", "-ec"] +RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin/" sh && \ + # Python / toolchain + VENV_PATH="${VENV_PATH:-/app/venv}" && \ + PYTHON_VERSION="${PYTHON_VERSION:-3.12}" && \ + PYTHON_COMMAND="${PYTHON_COMMAND:-python${PYTHON_VERSION}}" && \ + PY_TAG="${PYTHON_VERSION//./}" && \ + UV="${UV_INSTALL_PATH:-/usr/local/bin/uv}" && \ + PYTHON="${VENV_PATH}/bin/python" && \ + PATH="${VENV_PATH}/bin:${PATH}" && \ + "${UV}" venv "${VENV_PATH}" && \ + # Base dependencies + upip() { "${UV}" pip install --python "${PYTHON}" --no-progress --no-cache-dir --torch-backend=cu${CUDA_MAJOR}${CUDA_MINOR} "$@"; } && \ + upip pandas datasets rust-just regex setuptools-scm cmake && \ + upip nixl "nvshmem4py-cu${CUDA_MAJOR}" cuda-python && \ + # PIP in venv so 'python -m pip' works inside DeepEP build step + "${PYTHON}" -m ensurepip --upgrade && \ + "${PYTHON}" -m pip install -U pip wheel setuptools && \ + # Clone and change directory + git_clone_and_cd() { local url=$1 dir=$2 branch=${3:-main} commit=${4:-}; git clone --depth=1 --branch "${branch}" "${url}" "${dir}"; if [[ -n "${commit}" ]]; then git -C "${dir}" fetch --unshallow origin "${branch}"; git -C "${dir}" checkout "${commit}"; fi; git config --global url."https://github.com/".insteadOf "git@github.com:"; git -C "${dir}" submodule update --init --recursive; cd "${dir}"; } && \ + # DeepEP + git_clone_and_cd https://github.com/deepseek-ai/DeepEP /app/deepep main 9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee && \ + NVSHMEM_DIR="${NVSHMEM_PREFIX:-/opt/nvshmem}" "${PYTHON}" -m pip install --no-build-isolation --no-cache-dir . && \ + BUILD_DIR="build/lib.linux-$(uname -m)-cpython-${PY_TAG}" && \ + SO_NAME="deep_ep_cpp.cpython-${PY_TAG}-$(uname -m)-linux-gnu.so" && \ + [[ -f "${BUILD_DIR}/${SO_NAME}" ]] && ln -sf "${BUILD_DIR}/${SO_NAME}" . 
&& \ + # DeepGEMM + git_clone_and_cd https://github.com/deepseek-ai/DeepGEMM /app/deepgemm main ea9c5d92 && \ + "${UV}" pip uninstall --python "${PYTHON}" deep_gemm && \ + ./install.sh && \ + # FlashInfer + upip flashinfer-python && \ + # vLLM + git_clone_and_cd https://github.com/vllm-project/vllm.git /app/vllm releases/v0.11.0 && \ + VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/f71952c1c49fb86686b0b300b727b26282362bf4/vllm-0.11.0%2Bcu129-cp38-abi3-manylinux1_x86_64.whl VLLM_USE_PRECOMPILED=1 upip . + +ENTRYPOINT ["/app/code/venv/bin/vllm", "serve"] diff --git a/k8s/docker/Dockerfile.xpu b/k8s/docker/Dockerfile.xpu new file mode 100644 index 000000000..b163bb717 --- /dev/null +++ b/k8s/docker/Dockerfile.xpu @@ -0,0 +1,187 @@ +ARG ONEAPI_VERSION=2025.1.3-0 + +# ============================================================================ +# BUILD STAGE - Install build dependencies and create wheels +# ============================================================================ +FROM intel/deep-learning-essentials:${ONEAPI_VERSION}-devel-rockylinux9 AS builder + +ARG ONEAPI_VERSION=2025.1.3-0 +ARG PYTHON_VERSION +ARG VLLM_VERSION=v0.11.0 + +WORKDIR /workspace + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + UV_LINK_MODE=copy \ + PYTHON_VERSION=${PYTHON_VERSION:-3.12} \ + VIRTUAL_ENV=/opt/vllm \ + SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 \ + SYCL_CACHE_PERSISTENT=1 \ + VLLM_TARGET_DEVICE=xpu + + +# Update base packages +#RUN dnf update -y && dnf clean all + +# Install base packages and EPEL in single layer +RUN dnf install -y dnf-plugins-core && \ + dnf config-manager --enable crb && \ + dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ + dnf config-manager --set-enabled epel && \ + dnf install -y --allowerasing \ + python${PYTHON_VERSION} python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \ + python${PYTHON_VERSION}-devel \ + python3.9-devel \ + which procps findutils tar \ + gcc gcc-c++ \ + make cmake \ + autoconf automake libtool \ + git \ + curl wget \ + gzip \ + zlib-devel \ + openssl-devel \ + pkg-config \ + libuuid-devel \ + glibc-devel \ + rdma-core-devel \ + numactl-libs \ + subunit \ + pciutils \ + pciutils-libs \ + ninja-build \ + gh \ + && dnf clean all + +# Setup Python virtual environment +RUN python${PYTHON_VERSION} -m venv /opt/vllm && \ + ${VIRTUAL_ENV}/bin/pip install --no-cache -U pip wheel uv meson-python ninja pybind11 build + +ENV LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64" \ +CPATH="/usr/include:/usr/local/include" \ +PKG_CONFIG_PATH="/usr/lib/pkgconfig:/usr/local/lib/pkgconfig:/usr/local/lib64/pkgconfig" + +# Pin torch, so all deps are built against the same version +# as vllm itself +RUN --mount=type=cache,target=/root/.cache/uv \ + source ${VIRTUAL_ENV}/bin/activate && \ + uv pip install \ + # global + numpy \ + # nixl + pyyaml \ + types-PyYAML \ + pytest \ + patchelf>=0.11.0 + + +RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/local.conf && \ + echo "/usr/local/lib64" >> /etc/ld.so.conf.d/local.conf && \ + ldconfig + +WORKDIR /workspace + +#RUN mkdir -p /wheels +# Define commit SHAs as build args to avoid layer invalidation +#ARG LMCACHE_COMMIT_SHA=c1563bc9c72ea0d71156a3d9a6cd643170828acf + +# Clone repositories with cache mounts +# RUN --mount=type=cache,target=/var/cache/git \ +# git clone https://github.com/neuralmagic/LMCache.git && \ +# cd LMCache && \ +# git checkout -q $LMCACHE_COMMIT_SHA && \ +# cd .. 
&& \ +# # Build LMCache wheel +# cd LMCache && \ +# source ${VIRTUAL_ENV}/bin/activate && \ +# NO_CUDA_EXT=1 python -m build --wheel --no-isolation -o /wheels && \ +# cd .. + + +# Use existing virtual environment at /opt/vllm +WORKDIR /workspace/ + + +# Clone vLLM and build for XPU following official documentation +RUN --mount=type=cache,target=/var/cache/git \ + --mount=type=bind,source=.git,target=.git \ + git clone https://github.com/vllm-project/vllm.git && \ + cd vllm && \ + git checkout ${VLLM_VERSION} && \ + source ${VIRTUAL_ENV}/bin/activate && \ + pip install -v -r requirements/xpu.txt && \ + export VLLM_TARGET_DEVICE=xpu && \ + python setup.py install && \ + cd /workspace && rm -rf vllm + +# ============================================================================ +# RUNTIME STAGE - Minimal runtime image +# ============================================================================ +FROM intel/deep-learning-essentials:${ONEAPI_VERSION}-devel-rockylinux9 AS runtime + +ARG PYTHON_VERSION + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + UV_LINK_MODE=copy \ + PYTHON_VERSION=${PYTHON_VERSION:-3.12} \ + VIRTUAL_ENV=/opt/vllm \ + SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 \ + SYCL_CACHE_PERSISTENT=1 \ + VLLM_TARGET_DEVICE=xpu + +# Install only runtime dependencies +RUN dnf install -y --allowerasing \ + python${PYTHON_VERSION} python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-devel \ + rdma-core-devel \ + numactl-libs \ + pciutils \ + procps-ng \ + git \ + curl \ + gcc && dnf clean all + +# Setup ldconfig and library paths +RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/local.conf && \ + echo "/usr/local/lib64" >> /etc/ld.so.conf.d/local.conf && \ + ldconfig + +# Copy the complete virtual environment from builder +COPY --from=builder /opt/vllm /opt/vllm + + +# Install all packages +RUN --mount=type=cache,target=/var/cache/git \ + source /opt/vllm/bin/activate && \ + \ + # Install PyTorch and cuda-python + uv pip install huggingface_hub[hf_xet] && \ + uv pip install nixl==0.3.0 + +RUN dnf remove -y git && dnf autoremove -y && dnf clean all + +# setup non-root user for OpenShift with GPU access +# RUN umask 002 && \ +# # Create render group for GPU access (if not exists) +# groupadd -r render || true && \ +# groupadd -r video || true && \ +# # Add user to groups needed for GPU access +# useradd --uid 2000 --gid 0 --groups render,video vllm && \ +# rm -rf /home/vllm && \ +# mkdir -p /home/vllm && \ +# chown vllm:root /home/vllm && \ +# chmod g+rwx /home/vllm + + +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" \ + VLLM_USAGE_SOURCE=production-docker-image \ + TRITON_XPU_PROFILE=1 \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + VLLM_TARGET_DEVICE=xpu + + +# USER 2000 +# WORKDIR /home/vllm + +ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/k8s/guides/QUICKSTART.md b/k8s/guides/QUICKSTART.md new file mode 100644 index 000000000..7e5a3093e --- /dev/null +++ b/k8s/guides/QUICKSTART.md @@ -0,0 +1,96 @@ +# llm-d Quick Start + +## Overview + +This quick start will walk you through the steps to install and deploy llm-d on a Kubernetes cluster and explain some of the key choices at each step as well as how to validate and remove your deployment. + +## Prerequisites + +### Run with sufficient permissions to deploy + +Before running any deployment, ensure you have sufficient permissions to deploy new custom resource definitions (CRDs) and alter roles. Our guides are written for cluster administrators, especially for the prerequisites. 
Once prerequisites are configured, deploying model servers and new InferencePools typically requires only namespace editor permissions. + +> [!IMPORTANT] +> llm-d recommends separating infrastructure configuration -- like the inference gateway -- from workload deployment. Inference platform administrators are responsible for managing the cluster and dependencies while inference workload owners deploy and manage the lifecycle of the self-hosted model servers. +> +> The separation between these roles depends on the number of workloads present in your environment. A single production workload might see the same team managing all the software. In a large Internal Model as a Service deployment, the platform team might manage shared inference gateways and allow individual workload teams to directly manage the configuration and deployment of large model servers. See [the Inference Gateway docs](https://gateway-api-inference-extension.sigs.k8s.io/concepts/roles-and-personas/) for more examples of the role archetypes. + +### Tool Dependencies + +You will need to install some dependencies (like kubectl, helm, yq, git, etc.) and have a HuggingFace token for most examples. We have documented these requirements and instructions in the [prereq/client-setup directory](./prereq/client-setup/README.md). To install the dependencies, use the provided [install-deps.sh](./prereq/client-setup/install-deps.sh) script. + +> [!IMPORTANT] +> We anticipate that almost all production deployments will leverage configuration management automation, GitOps, or CI/CD pipelines to automate repeatable deployments. Most users have an opinion about how to deploy workloads and there is high variation in the needs of the model server deployment. llm-d therefore minimizes the amount of tooling and parameterization in our guides and prioritizes demonstrating complete examples and concepts to allow you to adapt our configuration to your use case. + +### HuggingFace Token + +A HuggingFace token is required to download models from the HuggingFace Hub. You must create a Kubernetes secret containing your HuggingFace token in the target namespace before deployment, see [instructions](./prereq/client-setup/README.md#huggingface-token). + +> [!IMPORTANT] +> vLLM by default will load models from HuggingFace as needed. Since in production environments downloading models is a source of startup latency and a potential point of failure (if the model provider is down), most deployments should cache downloads across multiple restarts and host copies of their models within the same failure domain as their replicas. + +### Configuring necessary infrastructure and your cluster + +llm-d can be deployed on a variety of Kubernetes distributions and managed providers. The [infrastructure prerequisite](./prereq/infrastructure/README.md) will help you ensure your cluster is properly configured with the resources necessary to run LLM inference. + +Specific requirements, workarounds, and any other documentation relevant to these platforms can be reviewed in the [infra-providers directory](../docs/infra-providers/). + +### Gateway provider + +llm-d integrates with the [Kubernetes Gateway API](https://gateway-api.sigs.k8s.io/) to optimize load balancing to your model server replicas and have access to the full set of service management features you are likely to need in production, such as traffic splitting and authentication / authorization. 
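If you are not sure whether your cluster already ships a Gateway implementation, a quick way to check is to look for installed GatewayClasses and the relevant CRDs. This is a generic sketch that assumes nothing beyond `kubectl` access to the cluster:

```bash
# Any existing Gateway implementation registers a GatewayClass
kubectl get gatewayclass

# Check whether the Gateway API and Inference Extension CRDs are already present
kubectl get crd | grep -E 'gateway\.networking\.k8s\.io|inference\.networking'
```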
+ +You must select an [appropriate Gateway implementation for your infrastructure and deploy the Gateway control plane and its prerequisite CRDs](./prereq/gateway-provider/README.md). + +> [!IMPORTANT] +> We recommend selecting a Gateway implementation provided by your infrastructure, if available. If not, we test and verify our guides with both [kgateway](https://kgateway.dev/docs/main/quickstart/) and [istio](https://istio.io/latest/docs/setup/getting-started/). + +## Deployment + +Select an appropriate guide from the list in the [README.md](./README.md). + +> [!IMPORTANT] +> We recommend starting with the [inference scheduling](./inference-scheduling/README.md) well-lit path if you are looking to deploy vLLM in a recommended production serving configuration. Use of an intelligent load balancer is broadly applicable to all environments and streamlines gathering the most critical operational metrics. + +Navigate to the desired guide directory and follow its README instructions. For example: + +```bash +cd quickstarts/guides/inference-scheduling # Navigate to your desired example directory +# Follow the README.md instructions in the example directory +``` + +When you complete the deployment successfully, return here. + +### Validation + +You should be able to list all Helm releases to view the charts installed by the guide: + +```bash +helm list -n ${NAMESPACE} +``` + +You can view all resources in your namespace with: + +```bash +kubectl get all -n ${NAMESPACE} +``` + +**Note:** This assumes no other guide deployments in your given `${NAMESPACE}`. + +### Making inference requests to your deployments + +For instructions on getting started with making inference requests, see [getting-started-inferencing.md](../docs/getting-started-inferencing.md). + +### Metrics collection + +llm-d charts include support for metrics collection from vLLM pods. llm-d applies PodMonitors to trigger Prometheus +scrape targets when enabled with the appropriate Helm chart values. See [MONITORING.md](../docs/monitoring/README.md) for details. + +In Kubernetes, Prometheus and Grafana can be installed from the prometheus-community +[kube-prometheus-stack helm charts](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack). In OpenShift, the built-in user workload monitoring Prometheus stack can be utilized to collect metrics. + +> [!IMPORTANT] +> We strongly recommend enabling monitoring and observability of llm-d components. LLM inference can bottleneck in multiple ways and troubleshooting performance may involve inspecting gateway, vLLM, OS, and hardware level metrics. + +### Uninstall + +To remove llm-d resources from the cluster, refer to the uninstallation instructions in your selected guide README. diff --git a/k8s/guides/README.md b/k8s/guides/README.md new file mode 100644 index 000000000..8b6ad3b15 --- /dev/null +++ b/k8s/guides/README.md @@ -0,0 +1,43 @@ +# High performance distributed inference on Kubernetes with llm-d + +Our guides provide tested and benchmarked recipes and Helm charts to serve large language models (LLMs) at peak performance with best practices common to production deployments. A familiarity with basic deployment and operation of Kubernetes is assumed. + +> [!TIP] +> If you want to learn by doing, follow a [step-by-step first deployment with QUICKSTART.md](./QUICKSTART.md). + +## Who are these guides (and llm-d) for? 
+ +These guides are targeted at startups and enterprises deploying production LLM serving that want the best possible performance while minimizing operational complexity. State of the art LLM inference involves multiple optimizations that offer meaningful tradeoffs, depending on use case. The guides help identify those key optimizations, understand their tradeoffs, and verify the gains against your own workload. + +We focus on the following use cases: + +* Deploying a self-hosted LLM behind a single workload across tens or hundreds of nodes +* Running a production model-as-a-service platform that supports many users and workloads sharing one or more LLM deployments + +## Well-Lit Path Guides + +A well-lit path is a documented, tested, and benchmarked solution of choice to reduce adoption risk and maintenance cost. These are the central best practices common to production deployments of large language model serving. + +We currently offer three tested and benchmarked paths to help you deploy large models: + +1. [Intelligent Inference Scheduling](./inference-scheduling/README.md) - Deploy [vLLM](https://docs.vllm.ai) behind the [Inference Gateway (IGW)](https://github.com/kubernetes-sigs/gateway-api-inference-extension) to decrease latency and increase throughput via [precise prefix-cache aware routing](./precise-prefix-cache-aware/README.md) and [customizable scheduling policies](https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md). +2. [Prefill/Decode Disaggregation](./pd-disaggregation/README.md) - Reduce time to first token (TTFT) and get more predictable time per output token (TPOT) by splitting inference into prefill servers handling prompts and decode servers handling responses, primarily on large models such as Llama-70B and when processing very long prompts. +3. [Wide Expert-Parallelism](./wide-ep-lws/README.md) - Deploy very large Mixture-of-Experts (MoE) models like [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) and significantly reduce end-to-end latency and increase throughput by scaling up with [Data Parallelism and Expert Parallelism](https://docs.vllm.ai/en/latest/serving/data_parallel_deployment.html) over fast accelerator networks. + +> [!IMPORTANT] +> These guides are intended to be a starting point for your own configuration and deployment of model servers. Our Helm charts provide basic reusable building blocks for vLLM deployments and inference scheduler configuration within these guides but will not support the full range of all possible configurations. Both guides and charts depend on features provided and supported in the [vLLM](https://github.com/vllm-project/vllm) and [inference gateway](https://github.com/kubernetes-sigs/gateway-api-inference-extension) open source projects. + +## Supporting Guides + +Our supporting guides address common operational challenges with model serving at scale: + +- [Simulating model servers](./simulated-accelerators/README.md) can deploy a vLLM model server simulator that allows testing inference scheduling and orchestration at scale as each instance does not need accelerators. + +## Other Guides + +The following guides have been provided by the community but do not fully integrate into the llm-d configuration structure yet and are not fully supported as well-lit paths: + +* Coming Soon! 
+
+> [!NOTE]
+> New guides added to this list enable at least one of the core well-lit paths but may directly include prerequisite steps specific to new hardware or infrastructure providers without full abstraction. A guide added here is expected to eventually become part of an existing well-lit path.
\ No newline at end of file
diff --git a/k8s/guides/inference-scheduling/README.md b/k8s/guides/inference-scheduling/README.md
new file mode 100644
index 000000000..bd1af5705
--- /dev/null
+++ b/k8s/guides/inference-scheduling/README.md
@@ -0,0 +1,173 @@
+# Well-lit Path: Intelligent Inference Scheduling
+
+## Overview
+
+This guide deploys the recommended out-of-the-box [scheduling configuration](https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md) for most vLLM deployments, reducing tail latency and increasing throughput through load-aware and prefix-cache-aware balancing. This can be run on a single GPU that can load [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B).
+
+This profile defaults to the approximate prefix cache aware scorer, which only observes request traffic to predict prefix cache locality. The [precise prefix cache aware routing feature](../precise-prefix-cache-aware) improves hit rate by introspecting the vLLM instances for cache entries and will become the default in a future release.
+
+## Hardware Requirements
+
+Out of the box, this example requires 2 Nvidia GPUs of any kind (support is determined by the inferencing image used).
+
+## Prerequisites
+
+- Have the [proper client tools installed on your local system](../prereq/client-setup/README.md) to use this guide.
+- Ensure your cluster infrastructure is sufficient to [deploy high scale inference](../prereq/infrastructure).
+- Configure and deploy your [Gateway control plane](../prereq/gateway-provider/README.md).
+- [Create the `llm-d-hf-token` secret in your target namespace with the key `HF_TOKEN` matching a valid HuggingFace token](../prereq/client-setup/README.md#huggingface-token) to pull models.
+- Have the [Monitoring stack](../../docs/monitoring/README.md) installed on your system.
+
+## Installation
+
+Use the helmfile to compose and install the stack. The namespace in which the stack will be deployed is derived from the `${NAMESPACE}` environment variable. If you have not set this, it defaults to `llm-d-inference-scheduler` in this example.
+
+```bash
+export NAMESPACE=llm-d-inference-scheduler # or any other namespace
+cd guides/inference-scheduling
+helmfile apply -n ${NAMESPACE}
+```
+
+**_NOTE:_** You can set the `$RELEASE_NAME_POSTFIX` env variable to change the release names. This is how we support concurrent installs. Ex: `RELEASE_NAME_POSTFIX=inference-scheduling-2 helmfile apply -n ${NAMESPACE}`
+
+**_NOTE:_** This uses Istio as the default provider; see [Gateway Options](./README.md#gateway-options) for installing with a specific provider.
+
+### Gateway and Hardware Options
+
+#### Gateway Options
+
+To specify your gateway choice, you can use the `-e` flag, ex:
+
+```bash
+helmfile apply -e kgateway -n ${NAMESPACE}
+```
+
+For DigitalOcean Kubernetes Service (DOKS):
+
+```bash
+helmfile apply -e digitalocean -n ${NAMESPACE}
+```
+
+**Note:** The DigitalOcean deployment uses the public Qwen/Qwen3-0.6B model (no HuggingFace token required) and is optimized for DOKS GPU nodes with automatic tolerations and node selectors. Gateway API v1 compatibility fixes are automatically included.
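Whichever provider you choose, it is worth confirming that the Gateway was accepted and received an address before continuing. A minimal check, assuming the default release name postfix (which yields a Gateway named `infra-inference-scheduling-inference-gateway`):

```bash
# Confirm the Gateway exists and has been assigned an address
kubectl get gateway infra-inference-scheduling-inference-gateway -n ${NAMESPACE}

# If no address appears, inspect the status conditions reported by the provider
kubectl describe gateway infra-inference-scheduling-inference-gateway -n ${NAMESPACE}
```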
+ +To see what gateway options are supported refer to our [gateway provider prereq doc](../prereq/gateway-provider/README.md#supported-providers). Gateway configurations per provider are tracked in the [gateway-configurations directory](../prereq/gateway-provider/common-configurations/). + +You can also customize your gateway, for more information on how to do that see our [gateway customization docs](../../docs/customizing-your-gateway.md). + +#### Hardware Backends + +Currently in the `inference-scheduling` example we suppport configurations for `xpu`, `tpu` and `cuda` GPUs. By default we use modelserver values supporting `cuda` GPUs, but to deploy on one of the other speciality hardware backends you may use: + +```bash +helmfile apply -e xpu -n ${NAMESPACE} # targets istio as gateway provider with XPU hardware +# or +helmfile apply -e gke_tpu -n ${NAMESPACE} # targets GKE externally managed as gateway provider with TPU hardware +``` + +### Install HTTPRoute + +Follow provider specific instructions for installing HTTPRoute. + +#### Install for "kgateway" or "istio" + +```bash +kubectl apply -f httproute.yaml -n ${NAMESPACE} +``` + +#### Install for "gke" + +```bash +kubectl apply -f httproute.gke.yaml -n ${NAMESPACE} +``` + +#### Install for "digitalocean" + +```bash +kubectl apply -f httproute.yaml -n ${NAMESPACE} +``` +## Verify the Installation + +- Firstly, you should be able to list all helm releases to view the 3 charts got installed into your chosen namespace: + +```bash +helm list -n ${NAMESPACE} +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +gaie-inference-scheduling llm-d-inference-scheduler 1 2025-08-24 11:24:53.231918 -0700 PDT deployed inferencepool-v1.0.1 v1.0.1 +infra-inference-scheduling llm-d-inference-scheduler 1 2025-08-24 11:24:49.551591 -0700 PDT deployed llm-d-infra-v1.3.3 v0.3.0 +ms-inference-scheduling llm-d-inference-scheduler 1 2025-08-24 11:24:58.360173 -0700 PDT deployed llm-d-modelservice-v0.2.9 v0.2.0 +``` + +- Out of the box with this example you should have the following resources: + +```bash +kubectl get all -n ${NAMESPACE} +NAME READY STATUS RESTARTS AGE +pod/gaie-inference-scheduling-epp-f8fbd9897-cxfvn 1/1 Running 0 3m59s +pod/infra-inference-scheduling-inference-gateway-istio-6787675b9swc 1/1 Running 0 4m3s +pod/ms-inference-scheduling-llm-d-modelservice-decode-8ff7fd5b58lw9 2/2 Running 0 3m55s +pod/ms-inference-scheduling-llm-d-modelservice-decode-8ff7fd5bt5f9s 2/2 Running 0 3m55s + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/gaie-inference-scheduling-epp ClusterIP 10.16.3.151 9002/TCP,9090/TCP 3m59s +service/gaie-inference-scheduling-ip-18c12339 ClusterIP None 54321/TCP 3m59s +service/infra-inference-scheduling-inference-gateway-istio LoadBalancer 10.16.1.195 10.16.4.2 15021:30274/TCP,80:32814/TCP 4m3s + +NAME READY UP-TO-DATE AVAILABLE AGE +deployment.apps/gaie-inference-scheduling-epp 1/1 1 1 4m +deployment.apps/infra-inference-scheduling-inference-gateway-istio 1/1 1 1 4m4s +deployment.apps/ms-inference-scheduling-llm-d-modelservice-decode 2/2 2 2 3m56s + +NAME DESIRED CURRENT READY AGE +replicaset.apps/gaie-inference-scheduling-epp-f8fbd9897 1 1 1 4m +replicaset.apps/infra-inference-scheduling-inference-gateway-istio-678767549 1 1 1 4m4s +replicaset.apps/ms-inference-scheduling-llm-d-modelservice-decode-8ff7fd5b8 2 2 2 3m56s +``` + +**_NOTE:_** This assumes no other guide deployments in your given `${NAMESPACE}` and you have not changed the default release names via the `${RELEASE_NAME}` environment variable. 
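As a quick smoke test before moving on, you can send a request through the gateway. The sketch below assumes the default Istio provider (the service name matches the output above) and the default `Qwen/Qwen3-0.6B` model from `ms-inference-scheduling/values.yaml`; the canonical instructions are in the docs linked in the next section:

```bash
# Port-forward the gateway service created by the Istio provider
kubectl port-forward -n ${NAMESPACE} service/infra-inference-scheduling-inference-gateway-istio 8000:80 &

# Send a small chat completion through the inference gateway
curl -sS http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-0.6B",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 32
  }'
```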
+ +## Using the stack + +For instructions on getting started making inference requests see [our docs](../../docs/getting-started-inferencing.md) + +## Cleanup + +To remove the deployment: + +```bash +# From examples/inference-scheduling +helmfile destroy -n ${NAMESPACE} + +# Or uninstall manually +helm uninstall infra-inference-scheduling -n ${NAMESPACE} +helm uninstall gaie-inference-scheduling -n ${NAMESPACE} +helm uninstall ms-inference-scheduling -n ${NAMESPACE} +``` + +**_NOTE:_** If you set the `$RELEASE_NAME_POSTFIX` environment variable, your release names will be different from the command above: `infra-$RELEASE_NAME_POSTFIX`, `gaie-$RELEASE_NAME_POSTFIX` and `ms-$RELEASE_NAME_POSTFIX`. + +### Cleanup HTTPRoute + +Follow provider specific instructions for deleting HTTPRoute. + +#### Cleanup for "kgateway" or "istio" + +```bash +kubectl delete -f httproute.yaml -n ${NAMESPACE} +``` + +#### Cleanup for "gke" + +```bash +kubectl delete -f httproute.gke.yaml -n ${NAMESPACE} +``` + +#### Cleanup for "digitalocean" + +```bash +kubectl delete -f httproute.yaml -n ${NAMESPACE} +``` + +## Customization + +For information on customizing a guide and tips to build your own, see [our docs](../../docs/customizing-a-guide.md) diff --git a/k8s/guides/inference-scheduling/gaie-inference-scheduling/values.yaml b/k8s/guides/inference-scheduling/gaie-inference-scheduling/values.yaml new file mode 100644 index 000000000..1ac83384b --- /dev/null +++ b/k8s/guides/inference-scheduling/gaie-inference-scheduling/values.yaml @@ -0,0 +1,31 @@ +inferenceExtension: + replicas: 1 + image: + # both downstream infernece-scheduler and upstream epp image can support inference-scheduling example + ################### + name: llm-d-inference-scheduler + hub: ghcr.io/llm-d + tag: v0.3.2 + ################### + # name: epp + # hub: registry.k8s.io/gateway-api-inference-extension + # tag: v1.0.1 + ################### + pullPolicy: Always + extProcPort: 9002 + pluginsConfigFile: "default-plugins.yaml" + monitoring: + interval: "10s" + # Service account token secret for authentication + secret: + name: inference-scheduling-gateway-sa-metrics-reader-secret + # Prometheus ServiceMonitor will be created when enabled for EPP metrics collection + prometheus: + enabled: true +inferencePool: + apiVersion: inference.networking.x-k8s.io/v1alpha2 # use old API version for inference + targetPortNumber: 8000 + modelServerType: vllm + modelServers: + matchLabels: + llm-d.ai/inferenceServing: "true" diff --git a/k8s/guides/inference-scheduling/helmfile.yaml.gotmpl b/k8s/guides/inference-scheduling/helmfile.yaml.gotmpl new file mode 100644 index 000000000..11269877a --- /dev/null +++ b/k8s/guides/inference-scheduling/helmfile.yaml.gotmpl @@ -0,0 +1,125 @@ +environments: + istio: &I + values: + - ../prereq/gateway-provider/common-configurations/istio.yaml + istioBench: &IB + values: + - ../prereq/gateway-provider/common-configurations/istio.yaml + - ../prereq/gateway-provider/common-configurations/benchmarking.yaml + kgateway: &KG + values: + - ../prereq/gateway-provider/common-configurations/kgateway.yaml + gke: &GKE + values: + - ../prereq/gateway-provider/common-configurations/gke.yaml + gke_tpu: &GKE_TPU + values: + - ../prereq/gateway-provider/common-configurations/gke_tpu.yaml + xpu: &XPU + <<: *I + digitalocean: &DO + <<: *I + default: &DEFAULT + <<: *I + +--- + +{{- $ns := .Namespace | default "llm-d-inference-scheduling" -}} +{{- $rn := (env "RELEASE_NAME_POSTFIX") | default "inference-scheduling" -}} + +repositories: + - name: 
llm-d-modelservice + url: https://llm-d-incubation.github.io/llm-d-modelservice/ + - name: llm-d-infra + url: https://llm-d-incubation.github.io/llm-d-infra/ + +releases: + - name: {{ printf "infra-%s" $rn | quote }} + namespace: {{ $ns }} + chart: llm-d-infra/llm-d-infra + version: v1.3.3 + installed: true + labels: + type: infrastructure + kind: inference-stack + values: + - gateway: + {{ .Environment.Values.gateway | toYaml | nindent 10 }} + + - name: {{ printf "gaie-%s" $rn | quote }} + namespace: {{ $ns }} + chart: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + version: v1.0.1 + installed: true + needs: + - {{ printf "infra-%s" $rn | quote }} + values: + - gaie-inference-scheduling/values.yaml + # Apply provider name if on GKE + {{- if or (eq .Environment.Name "gke") (eq .Environment.Name "gke_tpu") }} + - provider: + name: {{ .Environment.Values.provider.name }} + inferencePool: + apiVersion: {{ .Environment.Values.inferencePool.apiVersion }} + inferenceExtension: + monitoring: + gke: + enabled: true + prometheus: + enabled: false + {{- end }} + # Apply destination rule for anything istio + {{- if or (eq .Environment.Name "istio") (eq .Environment.Name "default") (eq .Environment.Name "istioBench") (eq .Environment.Name "xpu") }} + - provider: + name: {{ .Environment.Values.provider.name }} + - istio: + {{ .Environment.Values.istio | toYaml | nindent 10 }} + - istio: + destinationRule: + host: {{ printf "gaie-%s-epp.%s.svc.cluster.local" $rn $ns | quote }} + {{- end }} + # Apply log level only in bench setting + {{- if (eq .Environment.Name "istioBench") }} + - inferenceExtension: + flags: + {{ .Environment.Values.inferenceExtension.flags | toYaml | nindent 12 }} + {{- end }} + labels: + kind: inference-stack + + - name: {{ printf "ms-%s" $rn | quote }} + namespace: {{ $ns }} + chart: llm-d-modelservice/llm-d-modelservice + version: v0.2.10 + installed: true + needs: + - {{ printf "infra-%s" $rn | quote }} + - {{ printf "gaie-%s" $rn | quote }} + values: + {{- if eq .Environment.Name "gke_tpu" }} + - ms-inference-scheduling/values_tpu.yaml + {{- else if eq .Environment.Name "xpu" }} + - ms-inference-scheduling/values_xpu.yaml + {{- else if eq .Environment.Name "digitalocean" }} + - ms-inference-scheduling/digitalocean-values.yaml + {{- else if eq .Environment.Name "xpu" }} + - ms-inference-scheduling/values_xpu.yaml + {{- else }} + - ms-inference-scheduling/values.yaml + {{- end }} + {{- if (eq .Environment.Name "istioBench") }} + - routing: + {{ .Environment.Values.routing | toYaml | nindent 10 }} + {{- end }} + set: + # apply release name derived values + - name: "routing.inferencePool.name" + value: {{ printf "gaie-%s" $rn | quote }} + - name: "routing.parentRefs[0].name" + value: {{ printf "infra-%s-inference-gateway" $rn | quote }} + {{- if or (eq .Environment.Name "gke") (eq .Environment.Name "gke_tpu") }} + - name: "decode.monitoring.podmonitor.enabled" + value: false + {{- end }} + labels: + kind: inference-stack diff --git a/k8s/guides/inference-scheduling/httproute.gke.yaml b/k8s/guides/inference-scheduling/httproute.gke.yaml new file mode 100644 index 000000000..95fd1d1bb --- /dev/null +++ b/k8s/guides/inference-scheduling/httproute.gke.yaml @@ -0,0 +1,19 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-d-inference-scheduling +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-inference-scheduling-inference-gateway + rules: + - backendRefs: + - group: 
inference.networking.k8s.io + kind: InferencePool + name: gaie-inference-scheduling + weight: 1 + matches: + - path: + type: PathPrefix + value: / diff --git a/k8s/guides/inference-scheduling/httproute.yaml b/k8s/guides/inference-scheduling/httproute.yaml new file mode 100644 index 000000000..24c401365 --- /dev/null +++ b/k8s/guides/inference-scheduling/httproute.yaml @@ -0,0 +1,23 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-d-inference-scheduling +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-inference-scheduling-inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: gaie-inference-scheduling + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + matches: + - path: + type: PathPrefix + value: / diff --git a/k8s/guides/inference-scheduling/ms-inference-scheduling/digitalocean-values.yaml b/k8s/guides/inference-scheduling/ms-inference-scheduling/digitalocean-values.yaml new file mode 100644 index 000000000..1f8ac1154 --- /dev/null +++ b/k8s/guides/inference-scheduling/ms-inference-scheduling/digitalocean-values.yaml @@ -0,0 +1,89 @@ +# DigitalOcean Inference Scheduling Override +# This file contains ONLY the values that need to be overridden for DigitalOcean DOKS +# Architecture: 2 Decode Pods for intelligent scheduling (no P/D disaggregation) + +# DigitalOcean-specific model configuration (smaller model for DOKS) +modelArtifacts: + uri: "hf://Qwen/Qwen3-0.6B" # Smaller model that doesn't require HF token + name: "Qwen/Qwen3-0.6B" + size: 8Gi + authSecretName: "" # No HF token required for this model + +routing: + modelName: Qwen/Qwen3-0.6B # Match the model name + httpRoute: + create: false + +# DigitalOcean-specific container configuration +decode: + replicas: 2 + containers: + - name: "vllm" + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 + modelCommand: vllmServe # Required by chart + args: + - "--enforce-eager" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + - "--gpu-memory-utilization" # DigitalOcean GPU optimization + - "0.85" + - "--max-model-len" # Optimized for smaller model + - "4096" + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + ports: + - containerPort: 5557 + protocol: TCP + - containerPort: 8200 + name: metrics + protocol: TCP + resources: # Required for DOKS GPU nodes + limits: + memory: 16Gi + cpu: "4" + nvidia.com/gpu: "1" + requests: + memory: 16Gi + cpu: "4" + nvidia.com/gpu: "1" + mountModelVolume: true + volumeMounts: + - name: shm + mountPath: /dev/shm + - name: metrics-volume + mountPath: /.config + - name: torch-compile-cache + mountPath: /.cache + volumes: + - name: shm + emptyDir: + medium: Memory + sizeLimit: "4Gi" # Optimized for DigitalOcean + - name: metrics-volume + emptyDir: {} + - name: torch-compile-cache + emptyDir: {} + # IMPORTANT: DigitalOcean GPU node tolerations + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + +# Prefill disabled in inference-scheduling scenario +prefill: + create: false diff --git a/k8s/guides/inference-scheduling/ms-inference-scheduling/values.yaml 
b/k8s/guides/inference-scheduling/ms-inference-scheduling/values.yaml new file mode 100644 index 000000000..04388b1cd --- /dev/null +++ b/k8s/guides/inference-scheduling/ms-inference-scheduling/values.yaml @@ -0,0 +1,80 @@ +multinode: false + +modelArtifacts: + uri: "hf://Qwen/Qwen3-0.6B" + name: "Qwen/Qwen3-0.6B" + size: 20Gi + authSecretName: "llm-d-hf-token" + +routing: + servicePort: 8000 + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0 + connector: nixlv2 + secure: false + + inferencePool: + create: false + + httpRoute: + create: false + + epp: + create: false + +decode: + create: true + replicas: 2 + monitoring: + podmonitor: + enabled: true + portName: "metrics" # decode vLLM service port (from routing.proxy.targetPort) + path: "/metrics" + interval: "30s" + containers: + - name: "vllm" + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 + modelCommand: vllmServe + args: + - "--enforce-eager" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + ports: + - containerPort: 5557 + protocol: TCP + - containerPort: 8200 + name: metrics + protocol: TCP + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + mountModelVolume: true + volumeMounts: + - name: metrics-volume + mountPath: /.config + - name: torch-compile-cache + mountPath: /.cache + volumes: + - name: metrics-volume + emptyDir: {} + - name: torch-compile-cache + emptyDir: {} + +# PD disabled in this example +prefill: + create: false diff --git a/k8s/guides/inference-scheduling/ms-inference-scheduling/values_tpu.yaml b/k8s/guides/inference-scheduling/ms-inference-scheduling/values_tpu.yaml new file mode 100644 index 000000000..c57ea48ce --- /dev/null +++ b/k8s/guides/inference-scheduling/ms-inference-scheduling/values_tpu.yaml @@ -0,0 +1,69 @@ +# This values.yaml file creates the resources for Qwen/Qwen3-0.6B +multinode: false + +modelArtifacts: + uri: "hf://meta-llama/Llama-3.1-70B-Instruct" + size: 160Gi + authSecretName: "llm-d-hf-token" + name: meta-llama/Llama-3.1-70B + +routing: + servicePort: 8000 + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0 + connector: nixlv2 + secure: false + + inferencePool: + create: false + + httpRoute: + create: false + + epp: + create: false + +accelerator: + type: google + +decode: + parallelism: + tensor: 8 + create: true + replicas: 2 + extraConfig: + nodeSelector: + cloud.google.com/gke-tpu-topology: "2x4" + cloud.google.com/gke-tpu-accelerator: "tpu-v6e-slice" + monitoring: + podmonitor: + enabled: true + portName: "metrics" # decode vLLM service port (from routing.proxy.targetPort) + path: "/metrics" + interval: "30s" + containers: + - name: "vllm" + image: "vllm/vllm-tpu:e92694b6fe264a85371317295bca6643508034ef" + modelCommand: vllmServe + args: + - "--tensor-parallel-size=8" + - "--max-model-len=4096" + ports: + - containerPort: 5557 + protocol: TCP + - containerPort: 8200 + name: metrics + protocol: TCP + mountModelVolume: true + volumeMounts: + - name: metrics-volume + mountPath: /.config + - name: torch-compile-cache + mountPath: /.cache + volumes: + - name: metrics-volume + emptyDir: {} + - name: torch-compile-cache + emptyDir: {} +prefill: + create: false diff --git 
a/k8s/guides/inference-scheduling/ms-inference-scheduling/values_xpu.yaml b/k8s/guides/inference-scheduling/ms-inference-scheduling/values_xpu.yaml new file mode 100644 index 000000000..1ed2d05fd --- /dev/null +++ b/k8s/guides/inference-scheduling/ms-inference-scheduling/values_xpu.yaml @@ -0,0 +1,104 @@ +# Simplified Intel XPU configuration using imageDefault mode +# This configuration lets the chart handle most vLLM parameters automatically + +modelArtifacts: + name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + uri: "hf://deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + size: 10Gi + authSecretName: "llm-d-hf-token" + +accelerator: + type: "intel" + +# Routing configuration - required for inference-scheduling +routing: + servicePort: 8000 + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0 + connector: nixlv2 + secure: false + + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-r1-inference-gateway + + inferencePool: + create: false + + # create httpRoute via MS for using release name r1 override + httpRoute: + create: true + + epp: + create: false + +decode: + create: true + replicas: 2 + monitoring: + podmonitor: + enabled: true + portName: "metrics" + path: "/metrics" + interval: "30s" + containers: + - name: "vllm" + # Use Intel latest published xpu image + image: ghcr.io/llm-d/llm-d-xpu:v0.3.0 + modelCommand: "vllmServe" + args: + - "--enforce-eager" + - "--dtype" + - "float16" + - "--disable-sliding-window" + securityContext: + fsGroup: 107 + supplementalGroups: + - 107 + env: + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: CCL_ZE_IPC_EXCHANGE + value: "pidfd" + - name: ZE_AFFINITY_MASK + value: "0" + ports: + - containerPort: 8200 + protocol: TCP + resources: + limits: + memory: 24Gi + cpu: "8" + gpu.intel.com/i915: "1" + requests: + cpu: "4" + memory: 12Gi + gpu.intel.com/i915: "1" + + mountModelVolume: true + volumeMounts: + - name: metrics-volume + mountPath: /.config + - name: torch-compile-cache + mountPath: /.cache + volumes: + - name: metrics-volume + emptyDir: {} + - name: torch-compile-cache + emptyDir: {} + + + # XPU-specific node affinity + acceleratorTypes: + labelKey: "accelerator" + labelValues: + - "intel-xpu" + - "intel-gpu-max" + +# Disable prefill for simple XPU example +prefill: + create: false + +# When true, use LeaderWorkerSet for multi-node XPU setups +multinode: false diff --git a/k8s/guides/pd-disaggregation/README.md b/k8s/guides/pd-disaggregation/README.md new file mode 100644 index 000000000..fd016bafc --- /dev/null +++ b/k8s/guides/pd-disaggregation/README.md @@ -0,0 +1,196 @@ +# Well-lit Path: P/D Disaggregation + +## Overview + +This guide demonstrates how to deploy Llama-70B using vLLM's P/D disaggregation support with NIXL. This guide has been validated on: + +* an 8xH200 cluster with InfiniBand networking +* an 8xH200 cluster on GKE with RoCE networking + +> WARNING: We are still investigating and optimizing performance for other hardware and networking configurations + +In this example, we will demonstrate a deployment of `Llama-3.3-70B-Instruct-FP8` with: + +- 4 TP=1 Prefill Workers +- 1 TP=4 Decode Worker + +## P/D Best Practices + +P/D disaggregation can benefit overall throughput by: + +- Specializing P and D workers for compute-bound vs latency-bound workloads +- Reducing the number of copies of the model (increasing KV cache RAM) with wide parallelism + +However, P/D disaggregation is not a target for all workloads. We suggest exploring P/D disaggregation for workloads with: + +- Large models (e.g. 
Llama-70B+, not Llama-8B) +- Longer input sequence lengths (e.g 10k ISL | 1k OSL, not 200 ISL | 200 OSL) +- Sparse MoE architectures with opportunities for wide-EP + +As a result, as you tune your P/D deployments, we suggest focusing on the following parameters: + +- **Heterogeneous Parallelism**: deploy P workers with less parallelism and more replicas and D workers with more parallelism and fewer replicas +- **xPyD Ratios**: tuning the ratio of P workers to D workers to ensure balance for your ISL|OSL ratio + +For very large models leveraging wide-EP, traffic for KV cache transfer may contend with expert parallelism when the ISL|OSL ratio is also high. We recommend starting with RDMA for KV cache transfer before attempting to leverage TCP, as TCP transfer requires more tuning of UCX under NIXL. + +## Hardware Requirements + +This guide expects 8 Nvidia GPUs of any kind, and RDMA via InfiniBand or RoCE between all pods in the workload. + +## Prerequisites + +- Have the [proper client tools installed on your local system](../prereq/client-setup/README.md) to use this guide. +- Ensure your cluster infrastructure is sufficient to [deploy high scale inference](../prereq/infrastructure) +- Configure and deploy your [Gateway control plane](../prereq/gateway-provider/README.md). +- [Create the `llm-d-hf-token` secret in your target namespace with the key `HF_TOKEN` matching a valid HuggingFace token](../prereq/client-setup/README.md#huggingface-token) to pull models. +- Have the [Monitoring stack](../../docs/monitoring/README.md) installed on your system. + +## Installation + +Use the helmfile to compose and install the stack. The Namespace in which the stack will be deployed will be derived from the `${NAMESPACE}` environment variable. If you have not set this, it will default to `llm-d-pd` in this example. + +```bash +export NAMESPACE=llm-d-pd # Or any namespace your heart desires +cd guides/pd-disaggregation +helmfile apply -n ${NAMESPACE} +``` + +**_NOTE:_** You can set the `$RELEASE_NAME_POSTFIX` env variable to change the release names. This is how we support concurrent installs. Ex: `RELEASE_NAME_POSTFIX=pd-2 helmfile apply -n ${NAMESPACE}` + +**_NOTE:_** This uses Istio as the default provider, see [Gateway Options](./README.md#gateway-options) for installing with a specific provider. + +### Gateway options + +To see specify your gateway choice you can use the `-e ` flag, ex: + +```bash +helmfile apply -e kgateway -n ${NAMESPACE} +``` + +To see what gateway options are supported refer to our [gateway provider prereq doc](../prereq/gateway-provider/README.md#supported-providers). Gateway configurations per provider are tracked in the [gateway-configurations directory](../prereq/gateway-provider/common-configurations/). + +You can also customize your gateway, for more information on how to do that see our [gateway customization docs](../../docs/customizing-your-gateway.md). + +#### Infrastructure provider specifics + +This guide uses RDMA via InfiniBand or RoCE for disaggregated serving kv-cache transfer. The resource attributes required to configure accelerator networking are not yet standardized via [Kubernetes Dynamic Resource Allocation](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/) and so are parameterized per infra provider in the Helm charts. If your provider has a custom setting you will need to update the charts before deploying. + +### Install HTTPRoute + +Follow provider specific instructions for installing HTTPRoute. 
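If you are unsure which provider-specific file applies to your cluster, a quick check (assuming the default `pd` release postfix) is to look at the Gateway the infra chart created and the GatewayClass it references:

```bash
# Show the Gateway created by the infra chart along with its class and address
kubectl get gateway infra-pd-inference-gateway -n ${NAMESPACE} \
  -o custom-columns='NAME:.metadata.name,CLASS:.spec.gatewayClassName,ADDRESS:.status.addresses[*].value'
```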
+ +#### Install for "kgateway" or "istio" + +```bash +kubectl apply -f httproute.yaml -n ${NAMESPACE} +``` + +#### Install for "gke" + +```bash +kubectl apply -f httproute.gke.yaml -n ${NAMESPACE} +``` + +## Verify the Installation + +- Firstly, you should be able to list all helm releases to view the 3 charts got installed into your chosen namespace: + +```bash +helm list -n ${NAMESPACE} +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +gaie-pd llm-d-pd 1 2025-08-24 12:54:51.231537 -0700 PDT deployed inferencepool-v1.0.1 v1.0.1 +infra-pd llm-d-pd 1 2025-08-24 12:54:46.983361 -0700 PDT deployed llm-d-infra-v1.2.4 v0.2.0 +ms-pd llm-d-pd 1 2025-08-24 12:54:56.736873 -0700 PDT deployed llm-d-modelservice-v0.2.9 v0.2.0 +``` + +- Out of the box with this example you should have the following resources: + +```bash +kubectl get all -n ${NAMESPACE} +NAME READY STATUS RESTARTS AGE +pod/gaie-pd-epp-54444ddc66-qv6ds 1/1 Running 0 2m35s +pod/infra-pd-inference-gateway-istio-56d66db57f-zwtzn 1/1 Running 0 2m41s +pod/ms-pd-llm-d-modelservice-decode-84bf6d5bdd-jzfjn 2/2 Running 0 2m30s +pod/ms-pd-llm-d-modelservice-prefill-86f6fb7cdc-8kfb8 1/1 Running 0 2m30s +pod/ms-pd-llm-d-modelservice-prefill-86f6fb7cdc-g6wmp 1/1 Running 0 2m30s +pod/ms-pd-llm-d-modelservice-prefill-86f6fb7cdc-jx2w2 1/1 Running 0 2m30s +pod/ms-pd-llm-d-modelservice-prefill-86f6fb7cdc-vzcb8 1/1 Running 0 2m30s + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/gaie-pd-epp ClusterIP 10.16.0.255 9002/TCP,9090/TCP 2m35s +service/gaie-pd-ip-bb618139 ClusterIP None 54321/TCP 2m35s +service/infra-pd-inference-gateway-istio LoadBalancer 10.16.3.74 10.16.4.3 15021:31707/TCP,80:34096/TCP 2m41s + +NAME READY UP-TO-DATE AVAILABLE AGE +deployment.apps/gaie-pd-epp 1/1 1 1 2m36s +deployment.apps/infra-pd-inference-gateway-istio 1/1 1 1 2m42s +deployment.apps/ms-pd-llm-d-modelservice-decode 1/1 1 1 2m31s +deployment.apps/ms-pd-llm-d-modelservice-prefill 4/4 4 4 2m31s + +NAME DESIRED CURRENT READY AGE +replicaset.apps/gaie-pd-epp-54444ddc66 1 1 1 2m36s +replicaset.apps/infra-pd-inference-gateway-istio-56d66db57f 1 1 1 2m42s +replicaset.apps/ms-pd-llm-d-modelservice-decode-84bf6d5bdd 1 1 1 2m31s +replicaset.apps/ms-pd-llm-d-modelservice-prefill-86f6fb7cdc 4 4 4 2m31s +``` + +**_NOTE:_** This assumes no other guide deployments in your given `${NAMESPACE}` and you have not changed the default release names via the `${RELEASE_NAME}` environment variable. + +## Using the stack + +For instructions on getting started making inference requests see [our docs](../../docs/getting-started-inferencing.md) + +## Tuning Selective PD + +Selective PD is a feature in the `inference-scheduler` within the context of prefill-decode dissagregation, although it is disabled by default. This features enables routing to just decode even with the P/D deployed. To enable it, you will need to set `threshold` value for the `pd-profile-handler` plugin, in the [GAIE values file](./gaie-pd/values.yaml). 
You can see the value of this here: + +```bash +cat gaie-pd/values.yaml | yq '.inferenceExtension.pluginsCustomConfig."pd-config.yaml"' | yq '.plugins[] | select(.type == "pd-profile-handler")' +type: pd-profile-handler +parameters: + threshold: 0 # update this + hashBlockSize: 5 +``` + +Some examples in which you might want to do selective PD might include: +- When the prompt is short enough that the amount of work split inference into prefill and decode phases, and then open a kv transfer between those two GPUs is greater than the amount of work to do both phases on the same decode inference worker. +- When Prefill units are at full capacity. + +For information on this plugin, see our [`pd-profile-handler` docs in the inference-scheduler](https://github.com/llm-d/llm-d-inference-scheduler/blob/v0.3.0/docs/architecture.md?plain=1#L205-L210) + +## Cleanup + +To remove the deployment: + +```bash +# Remove the model services +helmfile destroy -n ${NAMESPACE} + +# Remove the infrastructure +helm uninstall ms-pd -n ${NAMESPACE} +helm uninstall gaie-pd -n ${NAMESPACE} +helm uninstall infra-pd -n ${NAMESPACE} +``` + +**_NOTE:_** If you set the `$RELEASE_NAME_POSTFIX` environment variable, your release names will be different from the command above: `infra-$RELEASE_NAME_POSTFIX`, `gaie-$RELEASE_NAME_POSTFIX` and `ms-$RELEASE_NAME_POSTFIX`. + +### Cleanup HTTPRoute + +Follow provider specific instructions for deleting HTTPRoute. + +#### Cleanup for "kgateway" or "istio" + +```bash +kubectl delete -f httproute.yaml -n ${NAMESPACE} +``` + +#### Cleanup for "gke" + +```bash +kubectl delete -f httproute.gke.yaml -n ${NAMESPACE} +``` + +## Customization + +For information on customizing a guide and tips to build your own, see [our docs](../../docs/customizing-a-guide.md) diff --git a/k8s/guides/pd-disaggregation/README.xpu.md b/k8s/guides/pd-disaggregation/README.xpu.md new file mode 100644 index 000000000..c9955e6a3 --- /dev/null +++ b/k8s/guides/pd-disaggregation/README.xpu.md @@ -0,0 +1,332 @@ +# Intel XPU PD Disaggregation Deployment Guide +This document provides complete steps for deploying Intel XPU PD (Prefill-Decode) disaggregation service on Kubernetes cluster using DeepSeek-R1-Distill-Qwen-1.5B model. PD disaggregation separates the prefill and decode phases of inference, allowing for more efficient resource utilization and improved throughput. + +## Prerequisites +### Hardware Requirements +* Intel Data Center GPU Max 1550 or compatible Intel XPU device +* At least 8GB system memory +* Sufficient disk space (recommended at least 50GB available) + +### Software Requirements +* Kubernetes cluster (v1.28.0+) +* Intel GPU Plugin deployed +* kubectl access with cluster-admin privileges + +## Step 0: Build Intel XPU Docker Image (Optional) +If you need to customize the vLLM version or build the image from source, you can build the Intel XPU Docker image: + +### Clone Repository +```shell +# Clone the llm-d repository +git clone https://github.com/llm-d/llm-d +cd llm-d +``` +### Build Default Image +#### Intel Data Center GPU Max 1550 +```shell +# Build with default vLLM version (v0.11.0) +make image-build DEVICE=xpu VERSION=v0.2.1 +``` + +#### Intel Corporation Battlemage G21 +```shell +# Build with default vLLM version (v0.11.0) +git clone https://github.com/vllm-project/vllm.git +git checkout v0.11.0 +docker build -f docker/Dockerfile.xpu -t ghcr.io/llm-d/llm-d-xpu-dev:v0.3.0 --shm-size=4g . 
+``` + +### Available Build Arguments +* `VLLM_VERSION`: vLLM version to build (default: v0.11.0) +* `PYTHON_VERSION`: Python version (default: 3.12) +* `ONEAPI_VERSION`: Intel OneAPI toolkit version (default: 2025.1.3-0) + +**⚠️ Important**: + +* If you're using a pre-built image, you can skip this step and proceed directly to Step 1. +* If you build a custom image, remember to load it into your cluster (see Step 2 for Kind cluster loading instructions). +* **Repository Integration**: The llm-d-infra project has been integrated into the main llm-d repository. All previous references to separate llm-d-infra installations are now unified under the main llm-d project structure. + +## Step 1: Install Tool Dependencies +```shell +# Navigate to llm-d repository (use the same repo from Step 0) +cd llm-d + +# Install necessary tools (helm, helmfile, kubectl, yq, git, kind, etc.) +./guides/prereq/client-setup/install-deps.sh +curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64 && chmod +x ./kind && sudo mv ./kind /usr/local/bin/kind + +# Optional: Install development tools (including chart-testing) +./guides/prereq/client-setup/install-deps.sh --dev +``` + +**Installed tools include:** + +* helm (v3.12.0+) +* helmfile (v1.1.0+) +* kubectl (v1.28.0+) +* yq (v4+) +* git (v2.30.0+) + +## Step 2: Create Kubernetes Cluster +If you don't have a Kubernetes cluster, you can create one using Kind: + +```shell +# Use the same llm-d repository +cd llm-d + +# Create Kind cluster with Intel GPU support configuration +# Note: Adjust kind configuration for Intel XPU as needed +kind create cluster --name llm-d-cluster --image kindest/node:v1.28.15 + +# Verify cluster is running +kubectl cluster-info +kubectl get nodes +``` + +### Load Built Image into Cluster (If using custom built image) +If you built the Intel XPU image in Step 0, load it into the Kind cluster: + +```shell +# Load the built image into Kind cluster +kind load docker-image ghcr.io/llm-d/llm-d-xpu:v0.3.0 --name llm-d-cluster + +# Or if you built with custom tag +kind load docker-image llm-d:custom-xpu --name llm-d-cluster + +# Verify image is loaded +docker exec -it llm-d-cluster-control-plane crictl images | grep llm-d +``` + +**For Intel XPU deployments**: You must have the Intel GPU Plugin deployed on your cluster. The plugin provides the `gpu.intel.com/i915` resource that the Intel XPU workloads require. + +To deploy the Intel GPU Plugin: + +```shell +kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/gpu_plugin?ref=v0.32.1' +``` + +**Note**: If you already have a Kubernetes cluster (v1.28.0+) with Intel GPU Plugin deployed, you can skip this step. 
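Before continuing, you can confirm that the plugin is running and that your nodes advertise Intel GPU resources. This is a generic check; the resource name depends on your hardware (`gpu.intel.com/i915` for Data Center GPU Max, `gpu.intel.com/xe` for Battlemage):

```shell
# The GPU plugin runs as a DaemonSet; make sure its pods are up
kubectl get daemonsets -A | grep -i intel

# GPU nodes should now report an allocatable gpu.intel.com/* resource
kubectl describe nodes | grep -E 'gpu\.intel\.com/(i915|xe)'
```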
+ +## Step 3: Install Gateway API Dependencies +```shell +# Install Gateway API dependencies +cd guides/prereq/gateway-provider +./install-gateway-provider-dependencies.sh +``` + +## Step 4: Deploy Gateway Control Plane +```shell +# Deploy Istio Gateway control plane +cd guides/prereq/gateway-provider +helmfile apply -f istio.helmfile.yaml + +# Or deploy only control plane (if CRDs already exist) +helmfile apply -f istio.helmfile.yaml --selector kind=gateway-control-plane +``` + + +## Step 5: Create HuggingFace Token Secret +```shell +# Set environment variables +export NAMESPACE=llm-d-pd +export RELEASE_NAME_POSTFIX=pd +export HF_TOKEN_NAME=${HF_TOKEN_NAME:-llm-d-hf-token} +export HF_TOKEN=$your-hf-token + +# Create namespace +kubectl create namespace ${NAMESPACE} --dry-run=client -o yaml | kubectl apply -f - + +# Create HuggingFace token secret (empty token for public models) +kubectl create secret generic $HF_TOKEN_NAME --from-literal="HF_TOKEN=${HF_TOKEN}" --namespace ${NAMESPACE} +``` + + +## Step 6: Deploy Intel XPU PD Disaggregation +⚠️ **Important - For Intel BMG GPU Users**: Before running `helmfile apply`, you must update the GPU resource type in `ms-pd/values_xpu.yaml`: + +```yaml +# Edit ms-pd/values_xpu.yaml +accelerator: + type: intel + resources: + intel: "gpu.intel.com/xe" # Add gpu.intel.com/xe + +# Also update decode and prefill resource specifications: +decode: + containers: + - name: "vllm" + resources: + limits: + gpu.intel.com/xe: 1 # Change from gpu.intel.com/i915 to gpu.intel.com/xe + requests: + gpu.intel.com/xe: 1 # Change from gpu.intel.com/i915 to gpu.intel.com/xe + +prefill: + containers: + - name: "vllm" + resources: + limits: + gpu.intel.com/xe: 1 # Change from gpu.intel.com/i915 to gpu.intel.com/xe + requests: + gpu.intel.com/xe: 1 # Change from gpu.intel.com/i915 to gpu.intel.com/xe +``` + + +**Resource Requirements by GPU Type:** + +* **Intel Data Center GPU Max 1550**: Use `gpu.intel.com/i915` +* **Intel BMG GPU (Battlemage G21)**: Use `gpu.intel.com/xe` + +```shell +# Navigate to PD disaggregation guide directory +cd guides/pd-disaggregation + +# Deploy Intel XPU PD disaggregation configuration +helmfile apply -e xpu -n ${NAMESPACE} +``` + +This will deploy three main components in the `llm-d-pd` namespace: + +1. **infra-pd**: Gateway infrastructure for PD disaggregation +2. **gaie-pd**: Gateway API inference extension with PD-specific routing +3. 
**ms-pd**: Model service with separate prefill and decode deployments
+
+### Deployment Architecture
+* **Decode Service**: 1 replica with 1 Intel GPU
+* **Prefill Service**: 3 replicas with 1 Intel GPU each
+* **Total GPU Usage**: 4 Intel GPUs (1 for decode + 3 for prefill)
+
+## Step 7: Verify Deployment
+### Check Helm Releases
+```shell
+helm list -n llm-d-pd
+```
+
+Expected output:
+
+```
+NAME       NAMESPACE   REVISION   STATUS     CHART
+gaie-pd    llm-d-pd    1          deployed   inferencepool-v0.5.1
+infra-pd   llm-d-pd    1          deployed   llm-d-infra-v1.3.0
+ms-pd      llm-d-pd    1          deployed   llm-d-modelservice-v0.2.11
+```
+
+### Check All Resources
+```shell
+kubectl get all -n llm-d-pd
+```
+
+### Monitor Pod Startup Status
+```shell
+# Check all PD pods status
+kubectl get pods -n llm-d-pd
+
+# Monitor decode pod startup (real-time)
+kubectl get pods -n llm-d-pd -l llm-d.ai/role=decode -w
+
+# Monitor prefill pods startup (real-time)
+kubectl get pods -n llm-d-pd -l llm-d.ai/role=prefill -w
+```
+
+### View vLLM Startup Logs
+#### Decode Pod Logs
+```shell
+# Get decode pod name
+DECODE_POD=$(kubectl get pods -n llm-d-pd -l llm-d.ai/role=decode -o jsonpath='{.items[0].metadata.name}')
+
+# View vLLM container logs
+kubectl logs -n llm-d-pd ${DECODE_POD} -c vllm -f
+
+# View recent logs
+kubectl logs -n llm-d-pd ${DECODE_POD} -c vllm --tail=50
+```
+
+#### Prefill Pod Logs
+```shell
+# Get prefill pod names
+PREFILL_PODS=($(kubectl get pods -n llm-d-pd -l llm-d.ai/role=prefill -o jsonpath='{.items[*].metadata.name}'))
+
+# View first prefill pod logs
+kubectl logs -n llm-d-pd ${PREFILL_PODS[0]} -f
+
+# View all prefill pod logs
+for pod in "${PREFILL_PODS[@]}"; do
+  echo "=== Logs for $pod ==="
+  kubectl logs -n llm-d-pd $pod --tail=20
+  echo ""
+done
+```
+
+## Step 8: Create HTTPRoute for Gateway Access
+### Check if HTTPRoute was Auto-Created
+First, check if the HTTPRoute was automatically created by the Chart:
+
+```shell
+# Check if HTTPRoute already exists
+kubectl get httproute -n llm-d-pd
+```
+
+**Note (HTTPRoute Auto-Creation)**: When using `llm-d-modelservice` Chart v0.2.9+, the HTTPRoute is typically created automatically during deployment. If you see the `ms-pd-llm-d-modelservice` HTTPRoute listed, you can skip the manual creation step below.
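If the route is listed, you can also confirm it was accepted by the gateway before skipping ahead; a simple check that avoids assumptions about extra tooling:

```shell
# Verify the auto-created route reports an Accepted condition from the gateway
kubectl get httproute ms-pd-llm-d-modelservice -n llm-d-pd -o yaml | grep -B1 -A3 'type: Accepted'
```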
+ +### Manual HTTPRoute Creation (If Not Auto-Created) +If no HTTPRoute was found, create one manually: + +```shell +# Apply the HTTPRoute configuration from the PD disaggregation guide +kubectl apply -f httproute.yaml +``` + +### Verify HTTPRoute Configuration +Verify the HTTPRoute is properly configured: + +```shell +# Check HTTPRoute status +kubectl get httproute -n llm-d-pd + +# Check gateway attachment +kubectl get gateway infra-pd-inference-gateway -n llm-d-pd -o yaml | grep -A 5 attachedRoutes + +# View HTTPRoute details +kubectl describe httproute -n llm-d-pd +``` + +Expected output should show: + +* HTTPRoute connecting to `infra-pd-inference-gateway` +* Backend pointing to `gaie-pd` InferencePool +* Status showing `Accepted` and `ResolvedRefs` conditions + +## Step 9: Test PD Disaggregation Inference Service +### Get Gateway Service Information +```shell +kubectl get service -n llm-d-pd infra-pd-inference-gateway-istio +``` + +### Perform Inference Requests +#### Method 1: Using Port Forwarding (Recommended) +```shell +# Port forward to local +kubectl port-forward -n llm-d-pd service/infra-pd-inference-gateway-istio 8086:80 & + +# Test health check +curl -X GET "http://localhost:8086/health" -v + +# Perform inference test +curl -X POST "http://localhost:8086/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "messages": [ + { + "role": "user", + "content": "Explain the benefits of prefill-decode disaggregation in LLM inference" + } + ], + "max_tokens": 150, + "temperature": 0.7 + }' +``` + diff --git a/k8s/guides/pd-disaggregation/gaie-pd/values.yaml b/k8s/guides/pd-disaggregation/gaie-pd/values.yaml new file mode 100644 index 000000000..36d4d6267 --- /dev/null +++ b/k8s/guides/pd-disaggregation/gaie-pd/values.yaml @@ -0,0 +1,60 @@ +inferenceExtension: + replicas: 1 + image: + name: llm-d-inference-scheduler + hub: ghcr.io/llm-d + tag: v0.3.2 + pullPolicy: Always + extProcPort: 9002 + pluginsConfigFile: "pd-config.yaml" + pluginsCustomConfig: # THIS CONFIG NEEDS TO BE CHECKED FOR INF SCHEDULER NEW IMAGE + pd-config.yaml: | + # ALWAYS DO PD IN THIS EXAMPLE (THRESHOLD 0) + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: prefill-header-handler + - type: prefill-filter + - type: decode-filter + - type: max-score-picker + - type: queue-scorer + parameters: + hashBlockSize: 5 + maxPrefixBlocksToMatch: 256 + lruCapacityPerServer: 31250 + - type: pd-profile-handler + parameters: + threshold: 0 + hashBlockSize: 5 + schedulingProfiles: + - name: prefill + plugins: + - pluginRef: prefill-filter + - pluginRef: queue-scorer + weight: 1.0 + - pluginRef: max-score-picker + - name: decode + plugins: + - pluginRef: decode-filter + - pluginRef: queue-scorer + weight: 1.0 + - pluginRef: max-score-picker + + # Monitoring configuration for EPP + monitoring: + interval: "10s" + # Service account token secret for authentication + secret: + name: pd-gateway-sa-metrics-reader-secret + # Prometheus ServiceMonitor will be created when enabled for EPP metrics collection + prometheus: + enabled: true + +inferencePool: + apiVersion: inference.networking.x-k8s.io/v1alpha2 # use old API version for inference + targetPortNumber: 8000 + modelServerType: vllm + modelServers: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: ms-pd-llm-d-modelservice diff --git a/k8s/guides/pd-disaggregation/helmfile.yaml.gotmpl b/k8s/guides/pd-disaggregation/helmfile.yaml.gotmpl new 
file mode 100644 index 000000000..0876bbcba --- /dev/null +++ b/k8s/guides/pd-disaggregation/helmfile.yaml.gotmpl @@ -0,0 +1,125 @@ +environments: + istio: &I + values: + - ../prereq/gateway-provider/common-configurations/istio.yaml + istioBench: &IB + values: + - ../prereq/gateway-provider/common-configurations/istio.yaml + - ../prereq/gateway-provider/common-configurations/benchmarking.yaml + kgateway: &KG + values: + - ../prereq/gateway-provider/common-configurations/kgateway.yaml + gke: &GKE + values: + - ../prereq/gateway-provider/common-configurations/gke.yaml + xpu: &XPU + values: + - ../prereq/gateway-provider/common-configurations/istio.yaml + default: + <<: *IB + +--- + +{{- $ns := .Namespace | default "llm-d-pd" -}} +{{- $rn := (env "RELEASE_NAME_POSTFIX") | default "pd" -}} + +repositories: + - name: llm-d-modelservice + url: https://llm-d-incubation.github.io/llm-d-modelservice/ + - name: llm-d-infra + url: https://llm-d-incubation.github.io/llm-d-infra/ + +releases: + - name: {{ printf "infra-%s" $rn | quote }} + namespace: {{ $ns }} + chart: llm-d-infra/llm-d-infra + version: v1.3.3 + installed: true + labels: + type: infrastructure + kind: inference-stack + values: + - gateway: + {{ .Environment.Values.gateway | toYaml | nindent 10 }} + + - name: {{ printf "gaie-%s" $rn | quote }} + namespace: {{ $ns }} + chart: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + version: v1.0.1 + installed: true + needs: + - {{ printf "infra-%s" $rn | quote }} + values: + - gaie-pd/values.yaml + # Apply provider name if on GKE + {{- if or (eq .Environment.Name "gke") (eq .Environment.Name "gke_tpu") }} + - provider: + name: {{ .Environment.Values.provider.name }} + inferencePool: + apiVersion: {{ .Environment.Values.inferencePool.apiVersion }} + inferenceExtension: + monitoring: + gke: + enabled: true + prometheus: + enabled: false + {{- end }} + # Apply destination rule for anything istio + {{- if or (eq .Environment.Name "istio") (eq .Environment.Name "default") (eq .Environment.Name "istioBench") }} + - provider: + name: {{ .Environment.Values.provider.name }} + - istio: + {{ .Environment.Values.istio | toYaml | nindent 10 }} + - istio: + destinationRule: + host: {{ printf "gaie-%s-epp.%s.svc.cluster.local" $rn $ns | quote }} + {{- end }} + # Apply log level only in bench setting + {{- if or (eq .Environment.Name "istioBench") (eq .Environment.Name "default") }} + - inferenceExtension: + flags: + {{ .Environment.Values.inferenceExtension.flags | toYaml | nindent 12 }} + {{- end }} + labels: + kind: inference-stack + + - name: {{ printf "ms-%s" $rn | quote }} + namespace: {{ $ns }} + chart: llm-d-modelservice/llm-d-modelservice + version: v0.2.11 + installed: true + needs: + - {{ printf "infra-%s" $rn | quote }} + - {{ printf "gaie-%s" $rn | quote }} + values: + {{- if eq .Environment.Name "xpu" }} + - ms-pd/values_xpu.yaml + {{- else }} + - ms-pd/values.yaml + {{- end }} + {{- if or (eq .Environment.Name "istioBench") (eq .Environment.Name "default") }} + - routing: + {{ .Environment.Values.routing | toYaml | nindent 10 }} + {{- end }} + set: + # apply release name derived values + - name: "routing.inferencePool.name" + value: {{ printf "gaie-%s" $rn | quote }} + - name: "routing.parentRefs[0].name" + value: {{ printf "infra-%s-inference-gateway" $rn | quote }} + {{- if eq .Environment.Name "gke" }} + - name: 'decode.containers[0].resources.limits.rdma/ib' + value: "null" + - name: 'decode.containers[0].resources.requests.rdma/ib' + value: "null" + - name: 
'prefill.containers[0].resources.limits.rdma/ib' + value: "null" + - name: 'prefill.containers[0].resources.requests.rdma/ib' + value: "null" + - name: "decode.monitoring.podmonitor.enabled" + value: false + - name: "prefill.monitoring.podmonitor.enabled" + value: false + {{- end }} + labels: + kind: inference-stack diff --git a/k8s/guides/pd-disaggregation/httproute.gke.yaml b/k8s/guides/pd-disaggregation/httproute.gke.yaml new file mode 100644 index 000000000..2983a9199 --- /dev/null +++ b/k8s/guides/pd-disaggregation/httproute.gke.yaml @@ -0,0 +1,19 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-d-pd-disaggregation +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-pd-inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: gaie-pd + weight: 1 + matches: + - path: + type: PathPrefix + value: / diff --git a/k8s/guides/pd-disaggregation/httproute.yaml b/k8s/guides/pd-disaggregation/httproute.yaml new file mode 100644 index 000000000..d749f3dee --- /dev/null +++ b/k8s/guides/pd-disaggregation/httproute.yaml @@ -0,0 +1,23 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-d-pd-disaggregation +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-pd-inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: gaie-pd + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + matches: + - path: + type: PathPrefix + value: / diff --git a/k8s/guides/pd-disaggregation/ms-pd/values.yaml b/k8s/guides/pd-disaggregation/ms-pd/values.yaml new file mode 100644 index 000000000..ffd560b0a --- /dev/null +++ b/k8s/guides/pd-disaggregation/ms-pd/values.yaml @@ -0,0 +1,151 @@ +multinode: false + +modelArtifacts: + uri: "hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" + size: 100Gi + authSecretName: "llm-d-hf-token" + name: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" + +routing: + servicePort: 8000 + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0 + connector: nixlv2 + secure: false + + inferencePool: + create: false + + httpRoute: + create: false + + epp: + create: false + +decode: + parallelism: + tensor: 4 + data: 1 + create: true + replicas: 1 + monitoring: + podmonitor: + enabled: true + portName: "metrics" # decode vLLM service port (from routing.proxy.targetPort) + path: "/metrics" + interval: "30s" + containers: + - name: "vllm" + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 + modelCommand: vllmServe + args: + # Keep tensor-parallelism as the first set of arguments + - "--tensor-parallel-size" + - "4" + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "32000" + env: + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + ports: + - containerPort: 8200 + name: metrics + protocol: TCP + resources: + limits: + memory: 64Gi + cpu: "16" + nvidia.com/gpu: "4" + rdma/ib: 1 + requests: + memory: 64Gi + cpu: "16" + nvidia.com/gpu: "4" + rdma/ib: 1 + mountModelVolume: true + volumeMounts: + - name: metrics-volume + mountPath: /.config + - name: shm + mountPath: /dev/shm + - name: torch-compile-cache + mountPath: /.cache + volumes: + - name: metrics-volume + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + sizeLimit: "16Gi" + - 
name: torch-compile-cache + emptyDir: {} + +prefill: + create: true + replicas: 4 + monitoring: + podmonitor: + enabled: true + portName: "metrics" # prefill vLLM service port (from routing.servicePort) + path: "/metrics" + interval: "30s" + containers: + - name: "vllm" + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 + modelCommand: vllmServe + args: + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "32000" + env: + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + ports: + - containerPort: 8000 + name: metrics + protocol: TCP + resources: + limits: + memory: 64Gi + cpu: "8" + nvidia.com/gpu: "1" + rdma/ib: 1 + requests: + memory: 64Gi + cpu: "8" + nvidia.com/gpu: "1" + rdma/ib: 1 + mountModelVolume: true + volumeMounts: + - name: metrics-volume + mountPath: /.config + - name: shm + mountPath: /dev/shm + - name: torch-compile-cache + mountPath: /.cache + volumes: + - name: metrics-volume + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + sizeLimit: "16Gi" + - name: torch-compile-cache + emptyDir: {} diff --git a/k8s/guides/pd-disaggregation/ms-pd/values_xpu.yaml b/k8s/guides/pd-disaggregation/ms-pd/values_xpu.yaml new file mode 100644 index 000000000..2a4ee19eb --- /dev/null +++ b/k8s/guides/pd-disaggregation/ms-pd/values_xpu.yaml @@ -0,0 +1,163 @@ +# Intel XPU configuration for PD Disaggregation +# This configuration sets up prefill and decode services with Intel XPU optimization + +multinode: false + +# Configure accelerator type for Intel XPU +accelerator: + type: intel + +modelArtifacts: + uri: "hf://deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + size: 10Gi # Smaller model size for Intel XPU + authSecretName: "llm-d-hf-token" + name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + +routing: + servicePort: 8000 + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + # name: infra-pd-inference-gateway # set in helmfile + + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0 + connector: nixlv2 + secure: false + + inferencePool: + create: false + # name: gaie-pd # set in helmfile + + httpRoute: + create: false + + epp: + create: false + +decode: + create: true + replicas: 1 + monitoring: + podmonitor: + enabled: false + portName: "metrics" # decode vLLM service port (from routing.proxy.targetPort) + path: "/metrics" + interval: "30s" + containers: + - name: "vllm" + image: ghcr.io/llm-d/llm-d-xpu:v0.3.0 + modelCommand: vllmServe + args: + - "--tensor-parallel-size" + - "1" + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both", "kv_buffer_device":"cpu"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "32000" + env: + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: "tcp" + - name: VLLM_WORKER_MULTIPROC_METHOD + value: "spawn" + - name: VLLM_USE_V1 + value: "1" + + ports: + - containerPort: 8200 + name: metrics + protocol: TCP + resources: + limits: + memory: 64Gi + cpu: "16" + gpu.intel.com/i915: "1" + requests: + memory: 64Gi + cpu: "16" + gpu.intel.com/i915: "1" + mountModelVolume: true + volumeMounts: + - name: metrics-volume + mountPath: /.config + - name: shm + mountPath: /dev/shm + - name: torch-compile-cache + mountPath: /.cache + volumes: + - name: metrics-volume + emptyDir: {} + - name: shm + 
emptyDir: {} + - name: torch-compile-cache + emptyDir: {} + +prefill: + create: true + replicas: 3 + monitoring: + podmonitor: + enabled: false + portName: "metrics" # prefill vLLM service port (from routing.servicePort) + path: "/metrics" + interval: "30s" + containers: + - name: "vllm" + image: ghcr.io/llm-d/llm-d-xpu:v0.3.0 + modelCommand: vllmServe + args: + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both", "kv_buffer_device":"cpu"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "32000" + env: + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: "tcp" + - name: VLLM_WORKER_MULTIPROC_METHOD + value: "spawn" + - name: VLLM_USE_V1 + value: "1" + ports: + - containerPort: 8000 + name: metrics + protocol: TCP + resources: + limits: + memory: 64Gi + cpu: "8" + gpu.intel.com/i915: "1" + requests: + memory: 64Gi + cpu: "8" + gpu.intel.com/i915: "1" + mountModelVolume: true + volumeMounts: + - name: metrics-volume + mountPath: /.config + - name: shm + mountPath: /dev/shm + - name: torch-compile-cache + mountPath: /.cache + volumes: + - name: metrics-volume + emptyDir: {} + - name: shm + emptyDir: {} + - name: torch-compile-cache + emptyDir: {} diff --git a/k8s/guides/precise-prefix-cache-aware/README.md b/k8s/guides/precise-prefix-cache-aware/README.md new file mode 100644 index 000000000..2113a9618 --- /dev/null +++ b/k8s/guides/precise-prefix-cache-aware/README.md @@ -0,0 +1,161 @@ +# Feature: Precise Prefix Cache Aware Routing + +## Overview + +This guide demonstrates how to configure the inference scheduler to use the new precise prefix cache aware routing based on [vLLM KV-Events](https://github.com/vllm-project/vllm/issues/16669) data. Precise prefix cache aware routing pulls up-to-date prefix cache status from serving instances, eliminating the need for additional indexing services and increasing cache hit rate at high throughput. + +## Prerequisites + +- Have the [proper client tools installed on your local system](../prereq/client-setup/README.md) to use this guide. +- Configure and deploy your [Gateway control plane](../prereq/gateway-provider/README.md). +- [Create the `llm-d-hf-token` secret in your target namespace with the key `HF_TOKEN` matching a valid HuggingFace token](../prereq/client-setup/README.md#huggingface-token) to pull models. +- Have the [Monitoring stack](../../docs/monitoring/README.md) installed on your system. + +## Installation + +Use the helmfile to compose and install the stack. The Namespace in which the stack will be deployed will be derived from the `${NAMESPACE}` environment variable. If you have not set this, it will default to `llm-d-precise` in this example. + +```bash +export NAMESPACE=llm-d-precise # Or any namespace your heart desires +cd guides/precise-prefix-cache-aware +helmfile apply -n ${NAMESPACE} +``` + +**_NOTE:_** You can set the `$RELEASE_NAME_POSTFIX` env variable to change the release names. This is how we support concurrent installs. Ex: `RELEASE_NAME_POSTFIX=kv-events-2 helmfile apply -n ${NAMESPACE}` + +**_NOTE:_** This uses Istio as the default provider, see [Gateway Options](./README.md#gateway-options) for installing with a specific provider. 
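+
+Before applying, you can optionally preview what the helmfile would install. A small sketch, assuming the optional `helm diff` plugin from the client setup prerequisites is installed:
+
+```bash
+# From guides/precise-prefix-cache-aware, preview the releases that would be installed
+helmfile diff -n ${NAMESPACE}
+```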
+
+### Gateway options
+
+To specify your gateway choice you can use the `-e` flag, e.g.:
+
+```bash
+helmfile apply -e kgateway -n ${NAMESPACE}
+```
+
+To see what gateway options are supported, refer to our [gateway provider prereq doc](../prereq/gateway-provider/README.md#supported-providers). Gateway configurations per provider are tracked in the [gateway-configurations directory](../prereq/gateway-provider/common-configurations/).
+
+You can also customize your gateway; for more information on how to do that, see our [gateway customization docs](../../docs/customizing-your-gateway.md).
+
+### Install HTTPRoute
+
+Follow the provider-specific instructions for installing the HTTPRoute.
+
+#### Install for "kgateway" or "istio"
+
+```bash
+kubectl apply -f httproute.yaml -n ${NAMESPACE}
+```
+
+#### Install for "gke"
+
+```bash
+kubectl apply -f httproute.gke.yaml -n ${NAMESPACE}
+```
+
+## Verify the Installation
+
+- First, you should be able to list all helm releases and see the 3 charts that were installed into your chosen namespace:
+
+```bash
+helm list -n ${NAMESPACE}
+NAME             NAMESPACE      REVISION  UPDATED                                  STATUS    CHART                       APP VERSION
+gaie-kv-events   llm-d-precise  1         2025-09-25 21:36:52.452999581 +0000 UTC  deployed  inferencepool-v1.0.1        v1.0.1
+infra-kv-events  llm-d-precise  1         2025-09-25 21:36:50.848300265 +0000 UTC  deployed  llm-d-infra-v1.3.3          v0.3.0
+ms-kv-events     llm-d-precise  1         2025-09-25 21:36:55.955958022 +0000 UTC  deployed  llm-d-modelservice-v0.2.11  v0.2.0
+```
+
+- Out of the box with this example, you should have the following resources:
+
+```bash
+kubectl get all -n ${NAMESPACE}
+NAME                                                          READY   STATUS    RESTARTS   AGE
+pod/gaie-kv-events-epp-687b78968b-wvswh                       1/1     Running   0          80s
+pod/infra-kv-events-inference-gateway-istio-949d87f84-zvsp2   1/1     Running   0          85s
+pod/ms-kv-events-llm-d-modelservice-decode-b874d48d9-bgm5r    2/2     Running   0          75s
+pod/ms-kv-events-llm-d-modelservice-decode-b874d48d9-ph64c    2/2     Running   0          75s
+
+NAME                                              TYPE           CLUSTER-IP   EXTERNAL-IP   PORT(S)                        AGE
+service/gaie-kv-events-epp                        ClusterIP      10.16.2.44   <none>        9002/TCP,9090/TCP,5557/TCP     81s
+service/gaie-kv-events-ip-805c964d                ClusterIP      None         <none>        54321/TCP                      75s
+service/infra-kv-events-inference-gateway-istio   LoadBalancer   10.16.1.30   10.16.4.2     15021:32033/TCP,80:39332/TCP   86s
+
+NAME                                                      READY   UP-TO-DATE   AVAILABLE   AGE
+deployment.apps/gaie-kv-events-epp                        1/1     1            1           81s
+deployment.apps/infra-kv-events-inference-gateway-istio   1/1     1            1           86s
+deployment.apps/ms-kv-events-llm-d-modelservice-decode    2/2     2            2           76s
+
+NAME                                                                DESIRED   CURRENT   READY   AGE
+replicaset.apps/gaie-kv-events-epp-687b78968b                       1         1         1       81s
+replicaset.apps/infra-kv-events-inference-gateway-istio-949d87f84   1         1         1       86s
+replicaset.apps/ms-kv-events-llm-d-modelservice-decode-b874d48d9    2         2         2       76s
+```
+
+**_NOTE:_** This assumes there are no other guide deployments in your given `${NAMESPACE}` and that you have not changed the default release names via the `${RELEASE_NAME_POSTFIX}` environment variable.
+
+## Testing this "well-lit path"
+
+We have docs on getting started with sending inference requests [available here](../../docs/getting-started-inferencing.md) that are general to all examples. However, this example has unique instructions for interacting with it, which are provided here:
+
+1. First, you will need to send a basic inference request to your gateway.
For in depth documentation on how to do this, please see the link above, but a command will be provided to work out of the box with default settings: + +```bash +kubectl port-forward -n ${NAMESPACE} service/infra-kv-events-inference-gateway-istio 8000:80 +export LONG_TEXT_200_WORDS="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." + +curl -s http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "prompt": "'"$LONG_TEXT_200_WORDS"'", + "max_tokens": 50 + }' | jq +``` + +1. Check the inference-scheduler's prefix-cache-scorer's scores with the following command: + +```bash +kubectl logs -l inferencepool=gaie-kv-events-epp -n ${NAMESPACE} --tail 100 | grep "Calculated score" | grep "precise-prefix-cache-scorer/precise-prefix-cache-scorer" +``` + +You should see output similar to: + +```json +{"level":"Level(-4)","ts":"2025-10-07T16:07:36Z","caller":"framework/scheduler_profile.go:165","msg":"Calculated score","x-request-id":"77790804-deb4-441a-9a03-d771d8e20778","objectiveKey":"","incomingModelName":"Qwen/Qwen3-0.6B","targetModelName":"Qwen/Qwen3-0.6B","priority":0,"plugin":"precise-prefix-cache-scorer/precise-prefix-cache-scorer","endpoint":{"name":"ms-kv-events-llm-d-modelservice-decode-75499f8dc5-pbp84","namespace":"llm-d-precise"},"score":0} +{"level":"Level(-4)","ts":"2025-10-07T16:07:36Z","caller":"framework/scheduler_profile.go:165","msg":"Calculated score","x-request-id":"77790804-deb4-441a-9a03-d771d8e20778","objectiveKey":"","incomingModelName":"Qwen/Qwen3-0.6B","targetModelName":"Qwen/Qwen3-0.6B","priority":0,"plugin":"precise-prefix-cache-scorer/precise-prefix-cache-scorer","endpoint":{"name":"ms-kv-events-llm-d-modelservice-decode-75499f8dc5-kgnqh","namespace":"llm-d-precise"},"score":0} +``` + +1. 
Repeat the steps above to see the prefix-cache-scorer in action + +You should see output similar to: + +```json +{"level":"Level(-4)","ts":"2025-10-07T16:09:21Z","caller":"framework/scheduler_profile.go:165","msg":"Calculated score","x-request-id":"f4c967aa-ad15-4be2-8640-55164da18dfa","objectiveKey":"","incomingModelName":"Qwen/Qwen3-0.6B","targetModelName":"Qwen/Qwen3-0.6B","priority":0,"plugin":"precise-prefix-cache-scorer/precise-prefix-cache-scorer","endpoint":{"name":"ms-kv-events-llm-d-modelservice-decode-75499f8dc5-pbp84","namespace":"llm-d-precise"},"score":0} +{"level":"Level(-4)","ts":"2025-10-07T16:09:21Z","caller":"framework/scheduler_profile.go:165","msg":"Calculated score","x-request-id":"f4c967aa-ad15-4be2-8640-55164da18dfa","objectiveKey":"","incomingModelName":"Qwen/Qwen3-0.6B","targetModelName":"Qwen/Qwen3-0.6B","priority":0,"plugin":"precise-prefix-cache-scorer/precise-prefix-cache-scorer","endpoint":{"name":"ms-kv-events-llm-d-modelservice-decode-75499f8dc5-kgnqh","namespace":"llm-d-precise"},"score":1} +``` + +**_NOTE:_** These logs will only appear for unique requests, so if you don't see repeated instances of these logs make sure to redo them in a unique way. + +Notice that the second time we called the `/v1/completions` endpoint, the prefix-cache-scorer was able to return a score for the pod, +indicating that it had cached the KV-blocks from the first call. + +## Cleanup + +To remove the deployment: + +```bash +# Remove the model services +# From examples/precise-prefix-cache-aware +helmfile destroy -n ${NAMESPACE} + +# Or uninstall manually +helm uninstall infra-kv-events -n ${NAMESPACE} +helm uninstall gaie-kv-events -n ${NAMESPACE} +helm uninstall ms-kv-events -n ${NAMESPACE} +``` + +**_NOTE:_** If you set the `$RELEASE_NAME_POSTFIX` environment variable, your release names will be different from the command above: `infra-$RELEASE_NAME_POSTFIX`, `gaie-$RELEASE_NAME_POSTFIX` and `ms-$RELEASE_NAME_POSTFIX`. 
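+
+For example, a manual cleanup for the hypothetical postfix `kv-events-2` used earlier would look like this:
+
+```bash
+# Manual cleanup when a custom release name postfix was used
+export RELEASE_NAME_POSTFIX=kv-events-2
+helm uninstall infra-${RELEASE_NAME_POSTFIX} -n ${NAMESPACE}
+helm uninstall gaie-${RELEASE_NAME_POSTFIX} -n ${NAMESPACE}
+helm uninstall ms-${RELEASE_NAME_POSTFIX} -n ${NAMESPACE}
+```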
+ +## Customization + +For information on customizing a guide and tips to build your own, see [our docs](../../docs/customizing-a-guide.md) diff --git a/k8s/guides/precise-prefix-cache-aware/gaie-kv-events/values.yaml b/k8s/guides/precise-prefix-cache-aware/gaie-kv-events/values.yaml new file mode 100644 index 000000000..3e8285472 --- /dev/null +++ b/k8s/guides/precise-prefix-cache-aware/gaie-kv-events/values.yaml @@ -0,0 +1,81 @@ +inferenceExtension: + replicas: 1 + image: + # both downstream infernece-scheduler and upstream epp images can support precise KV Cache awareness based on the configurations here + ################### + name: llm-d-inference-scheduler + hub: ghcr.io/llm-d + tag: v0.3.2 + ################### + # name: epp + # hub: registry.k8s.io/gateway-api-inference-extension + # tag: v1.0.1 + ################### + pullPolicy: Always + extProcPort: 9002 + # ZMQ port for `kvevents.Pool` (KVEvents subscriber) + extraContainerPorts: + - name: zmq + containerPort: 5557 + protocol: TCP + extraServicePorts: + - name: zmq + port: 5557 + targetPort: 5557 + protocol: TCP + # huggingface token for tokenizer + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: llm-d-hf-token + key: HF_TOKEN + flags: + # Log verbosity + - name: v + value: 4 + + pluginsConfigFile: "precise-prefix-cache-config.yaml" + pluginsCustomConfig: + precise-prefix-cache-config.yaml: | + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: single-profile-handler + - type: precise-prefix-cache-scorer + parameters: + indexerConfig: + tokenProcessorConfig: + blockSize: 64 + hashSeed: "42" + - type: kv-cache-utilization-scorer + - type: queue-scorer + - type: max-score-picker + schedulingProfiles: + - name: default + plugins: + - pluginRef: precise-prefix-cache-scorer + weight: 3.0 + - pluginRef: kv-cache-utilization-scorer + weight: 2.0 + - pluginRef: queue-scorer + weight: 2.0 + - pluginRef: max-score-picker + + # Monitoring configuration for EPP + monitoring: + interval: "10s" + # Service account token secret for authentication + secret: + name: prefix-routing-gateway-sa-metrics-reader-secret + + # Prometheus ServiceMonitor will be created when enabled for EPP metrics collection + prometheus: + enabled: true +inferencePool: + apiVersion: inference.networking.x-k8s.io/v1alpha2 # use old API version for inference + targetPortNumber: 8000 + modelServerType: vllm + modelServers: + matchLabels: + llm-d.ai/inferenceServing: "true" diff --git a/k8s/guides/precise-prefix-cache-aware/helmfile.yaml.gotmpl b/k8s/guides/precise-prefix-cache-aware/helmfile.yaml.gotmpl new file mode 100644 index 000000000..294c8d3e1 --- /dev/null +++ b/k8s/guides/precise-prefix-cache-aware/helmfile.yaml.gotmpl @@ -0,0 +1,110 @@ +environments: + istio: &I + values: + - ../prereq/gateway-provider/common-configurations/istio.yaml + istioBench: &IB + values: + - ../prereq/gateway-provider/common-configurations/istio.yaml + - ../prereq/gateway-provider/common-configurations/benchmarking.yaml + kgateway: &KG + values: + - ../prereq/gateway-provider/common-configurations/kgateway.yaml + gke: &GKE + values: + - ../prereq/gateway-provider/common-configurations/gke.yaml + default: + <<: *I + +--- + +{{- $ns := .Namespace | default "llm-d-precise" -}} +{{- $rn := (env "RELEASE_NAME_POSTFIX") | default "kv-events" -}} + +repositories: + - name: llm-d-modelservice + url: https://llm-d-incubation.github.io/llm-d-modelservice/ + - name: llm-d-infra + url: https://llm-d-incubation.github.io/llm-d-infra/ + 
+releases: + - name: {{ printf "infra-%s" $rn | quote }} + namespace: {{ $ns }} + chart: llm-d-infra/llm-d-infra + version: v1.3.3 + installed: true + labels: + type: infrastructure + kind: inference-stack + values: + - gateway: + {{ .Environment.Values.gateway | toYaml | nindent 10 }} + + - name: {{ printf "gaie-%s" $rn | quote }} + namespace: {{ $ns }} + chart: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + version: v1.0.1 + installed: true + needs: + - {{ printf "infra-%s" $rn | quote }} + values: + - gaie-kv-events/values.yaml + # Apply provider name if on GKE + {{- if eq .Environment.Name "gke" }} + - provider: + name: {{ .Environment.Values.provider.name }} + inferencePool: + apiVersion: {{ .Environment.Values.inferencePool.apiVersion }} + inferenceExtension: + monitoring: + gke: + enabled: true + prometheus: + enabled: false + {{- end }} + # Apply destination rule for anything istio + {{- if or (eq .Environment.Name "istio") (eq .Environment.Name "default") (eq .Environment.Name "istioBench") }} + - provider: + name: {{ .Environment.Values.provider.name }} + - istio: + {{ .Environment.Values.istio | toYaml | nindent 10 }} + - istio: + destinationRule: + host: {{ printf "gaie-%s-epp.%s.svc.cluster.local" $rn $ns | quote }} + {{- end }} + # Apply log level only in bench setting + {{- if (eq .Environment.Name "istioBench") }} + - inferenceExtension: + flags: + {{ .Environment.Values.inferenceExtension.flags | toYaml | nindent 12 }} + {{- end }} + labels: + kind: inference-stack + + - name: {{ printf "ms-%s" $rn | quote }} + namespace: {{ $ns }} + chart: llm-d-modelservice/llm-d-modelservice + version: v0.2.11 + installed: true + needs: + - {{ printf "infra-%s" $rn | quote }} + - {{ printf "gaie-%s" $rn | quote }} + values: + - ms-kv-events/values.yaml + {{- if (eq .Environment.Name "istioBench") }} + - routing: + {{ .Environment.Values.routing | toYaml | nindent 10 }} + {{- end }} + set: + # apply release name derived values + - name: "routing.inferencePool.name" + value: {{ printf "gaie-%s" $rn | quote }} + - name: "routing.parentRefs[0].name" + value: {{ printf "infra-%s-inference-gateway" $rn | quote }} + - name: "decode.containers[0].env[0].value" + value: {{ printf "%s" $rn | quote }} + {{- if (eq .Environment.Name "gke") }} + - name: "decode.monitoring.podmonitor.enabled" + value: false + {{- end }} + labels: + kind: inference-stack diff --git a/k8s/guides/precise-prefix-cache-aware/httproute.gke.yaml b/k8s/guides/precise-prefix-cache-aware/httproute.gke.yaml new file mode 100644 index 000000000..a11a4861a --- /dev/null +++ b/k8s/guides/precise-prefix-cache-aware/httproute.gke.yaml @@ -0,0 +1,19 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-d-kv-events +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-kv-events-inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: gaie-kv-events + weight: 1 + matches: + - path: + type: PathPrefix + value: / diff --git a/k8s/guides/precise-prefix-cache-aware/httproute.yaml b/k8s/guides/precise-prefix-cache-aware/httproute.yaml new file mode 100644 index 000000000..32119f634 --- /dev/null +++ b/k8s/guides/precise-prefix-cache-aware/httproute.yaml @@ -0,0 +1,23 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-d-kv-events +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-kv-events-inference-gateway + rules: + - 
backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: gaie-kv-events + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + matches: + - path: + type: PathPrefix + value: / diff --git a/k8s/guides/precise-prefix-cache-aware/ms-kv-events/values.yaml b/k8s/guides/precise-prefix-cache-aware/ms-kv-events/values.yaml new file mode 100644 index 000000000..c744a8c59 --- /dev/null +++ b/k8s/guides/precise-prefix-cache-aware/ms-kv-events/values.yaml @@ -0,0 +1,101 @@ +multinode: false + +modelArtifacts: + uri: "hf://Qwen/Qwen3-0.6B" + size: 20Gi + authSecretName: "llm-d-hf-token" + name: "Qwen/Qwen3-0.6B" + +routing: + servicePort: 8000 + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0 + connector: nixlv2 + secure: false + + inferencePool: + create: false + + httpRoute: + create: false + + epp: + create: false + +decode: + create: true + replicas: 2 + monitoring: + podmonitor: + enabled: true + portName: "metrics" # decode vLLM service port (from routing.proxy.targetPort) + path: "/metrics" + interval: "30s" + containers: + - name: "vllm" + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 + modelCommand: custom + command: + - /bin/sh + - '-c' + args: + - | + vllm serve Qwen/Qwen3-0.6B \ + --host 0.0.0.0 \ + --port 8200 \ + --block-size 64 \ + --prefix-caching-hash-algo sha256_cbor \ + --enforce-eager \ + --kv-transfer-config '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' \ + --kv-events-config "{\"enable_kv_cache_events\":true,\"publisher\":\"zmq\",\"endpoint\":\"tcp://gaie-${GAIE_RELEASE_NAME_POSTFIX}-epp.${NAMESPACE}.svc.cluster.local:5557\",\"topic\":\"kv@${POD_IP}@Qwen/Qwen3-0.6B\"}" + env: + - name: GAIE_RELEASE_NAME_POSTFIX # index 0 matters because of set in helmfile + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace # assumed to be the same as the EPP's + - name: PYTHONHASHSEED + value: "42" + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + ports: + - containerPort: 5557 + protocol: TCP + - containerPort: 8200 + name: metrics + protocol: TCP + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + mountModelVolume: true + volumeMounts: + - name: metrics-volume + mountPath: /.config + - name: torch-compile-cache + mountPath: /.cache + volumes: + - name: metrics-volume + emptyDir: {} + - name: torch-compile-cache + emptyDir: {} + +# PD disabled in this example +prefill: + create: false diff --git a/k8s/guides/predicted-latency-based-scheduling/README.md b/k8s/guides/predicted-latency-based-scheduling/README.md new file mode 100644 index 000000000..9aa3a25ea --- /dev/null +++ b/k8s/guides/predicted-latency-based-scheduling/README.md @@ -0,0 +1,324 @@ +# Experimental Feature: Predicted Latency based Load Balancing + +## Overview + +This experimental feature introduces **predicted latency based load balancing**, where scheduling decisions are guided by real-time predictions of request latency rather than only utilization metrics like queue depth or KV-cache utilization. + +- **Problem:** Utilization-based load balancing misses some distinct characteristics of LLM workloads, leading to requests missing SLO targets or leads to overly conservative routing that wastes capacity. 
+- **Approach:** The Endpoint Picker (EPP) integrates with **in-pod latency predictor sidecars** that continuously learn from live traffic. These sidecars estimate **p90 TTFT** and **p90 TPOT** for each candidate pod given current load, prefix cache state, and request features. +- **Outcome:** The **SLO scorer** compares predictions against per-request SLOs and directs traffic to pods with some headroom. If none exist, requests are shed (priority < 0) or sent to a weighted pool favoring lower latency pods. +### Tradeoffs & Gaps + +- **Homogeneous InferencePool** + Current predictors assume that all model server pods are identical (same GPU type, model weights, and serving configuration). Heterogeneous pools are not yet modeled. + +- **Scaling limits** + Each prediction sidecar can sustain ~300 QPS on a c4-standard-192 Google cloud machine (**≈ 192 vCPUs, 720 GB RAM, Up to 100 Gbps network, Up to 200 Gbps aggregate throughput**). Because the EPP makes one prediction call per candidate pod, total prediction load grows with both **cluster QPS** and **pod count**. If traffic or pod count increases, prediction servers must be scaled horizontally. + +- **Training mode** + Only streaming workloads (set **"stream": "true"** in the request body as per openAI protocol) are supported. + +- **Percentiles** + The predictor currently estimates only **p90** TTFT and TPOT. Other percentiles (p95, p99) or a mix of percentiles are not yet available. + +- **Prefill/Decode disaggregation** + Current routing does **not support prefill/decode disaggregation** (where one pod performs prefill and another performs decode). Prediction and SLO scoring assume a pod executes the entire request lifecycle. Support for disaggregated serving is a **work in progress**. + +- **Unvalidated against advanced inference features** + Predictions have not yet been tested with advanced serving strategies such as LoRA adapters, speculative decoding, or beam search. Each of these may shift latency characteristics (e.g., speculative decoding may reduce TTFT but increase TPOT variance), and models may need to be extended to remain accurate in these contexts. + + +### What is Tested + +This feature has been validated against the scenarios described in the [original design doc](https://docs.google.com/document/d/1q56wr3N5XGx0B21MzHu5oBsCiGi9VrbZAvyhP2VFG_c/edit?tab=t.0#heading=h.ob7j9esmcyd3) — including **short-prompt/long-completion**, **long-prompt/short-completion**, and **mixed workloads** — to compare baseline inference gateway routing versus prediction-based SLO routing. The benchmarking results are included in this doc. + +This guide explains how to deploy EPP with latency predictor sidecars, configure profiles and scorers, and enable **SLO-aware routing** via headers. 
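+
+Once the stack is running, one quick way to sanity-check the predictor is to compare the predicted and observed latencies returned in the final usage frame of a streamed response. A hedged sketch, using the field names from the example response later in this guide and assuming `jq` and `sed` are available:
+
+```bash
+# Send one streaming request and print predicted vs. observed latency from the final usage frame
+curl -sN "$GW_IP/v1/completions" \
+  -H 'Content-Type: application/json' \
+  -H 'x-prediction-based-scheduling: true' \
+  -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hello","max_tokens":100,"stream":"true","stream_options":{"include_usage":"true"}}' \
+  | sed -n 's/^data: //p' | grep -v '^\[DONE\]' \
+  | jq -s '[.[] | select(.usage != null)] | last | .usage
+           | {ttft_ms, predicted_ttft_ms, avg_tpot_ms, avg_predicted_tpot_ms}'
+```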
+ +--- + +## Prerequisites + +- **Install the Inference Gateway extension** + Follow the official installation steps here: + https://gateway-api-inference-extension.sigs.k8s.io/guides/ + +- **Build your EPP image** from the experimental branch: + + + ***Prerequisites*** + - Docker/BuildKit installed + - Access to a container registry (e.g., GCP Artifact Registry, Docker Hub, ECR) + + ***Clone & checkout*** + ```bash + git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension.git + cd gateway-api-inference-extension + git checkout slo-prediction-experimental + ``` + + ***Set your target registry and tag*** + ``` + export IMG="/epp:slo-prediction-$(git rev-parse --short HEAD)" + ``` + + ***Build the image*** + ``` + docker build -t "$IMG" -f Dockerfile . + ``` + + ***Push the image*** + ``` + docker push "$IMG" + ``` + +- **Build your EPP Sidecars** from the same experimental branch as described here: + https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/slo-prediction-experimental/latencypredictor-v1 + +--- + +## Testing Predicted Latency based Scheduling + +Once prerequisites are met, you can validate predicted latency based scheduling: + +1. **Apply your InferencePool/EPP manifest** + - Consult the example manifest shown [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/slo-prediction-experimental/config/manifests/inferencepool-resources-lp.yaml) + - Update the EPP container and sidecar images to the ones you built. + - Confirm that the `Deployment` includes the EPP container, training sidecar, and three prediction sidecars, each with their own volumes. + - Ensure the `plugins-config` ConfigMap defines both `default` and `slo` profiles. + +2. **Check readiness** + - Verify pod status: `kubectl get pods` → all containers `Running/Ready`. + - Training sidecar health: `curl http://:8000/readyz` + - Prediction sidecar health: `curl http://:8001/readyz` (and 8002, 8003). + - EPP gRPC health: port `9003` (liveness/readiness probes). + +3. **Send traffic** + - **Baseline:** run requests using the **`default`** profile (no prediction headers). + - **SLO-aware:** run requests with the **`slo`** profile and set + `x-prediction-based-scheduling: true`, optionally adding SLO headers like `x-slo-ttft-ms` and `x-slo-tpot-ms`. + + Example request: + ```bash + curl -v $GW_IP/v1/completions \ + -H 'Content-Type: application/json' \ + -H 'x-prediction-based-scheduling: true' \ + -H 'x-slo-ttft-ms: 200' \ + -H 'x-slo-tpot-ms: 50' \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "prompt": "what is the difference between Franz and Apache Kafka?", + "max_tokens": 200, + "temperature": 0, + "stream_options": {"include_usage": "true"}, + "stream": "true" + }' + ``` + + Example response (abridged SSE): + ``` + < HTTP/1.1 200 OK + < content-type: text/event-stream; charset=utf-8 + ... + data: {"choices":[{"index":0,"text":" Apache"}], "object":"text_completion", ...} + data: {"choices":[{"index":0,"text":" Kafka"}], "object":"text_completion", ...} + ... (many streamed tokens) ... 
+ data: { + "object":"text_completion", + "usage": { + "prompt_tokens": 12, + "completion_tokens": 200, + "total_tokens": 212, + "ttft_ms": 59, + "tpot_observations_ms": [9, 6], + "avg_tpot_ms": 7.5, + "predicted_ttft_ms": 273.23, + "predicted_tpot_observations_ms": [176.22, 18.17], + "avg_predicted_tpot_ms": 97.19 + } + } + data: [DONE] + ``` + + - The final SSE frame includes both **predictions and actuals** so you can validate accuracy (e.g., `predicted_ttft_ms` vs `ttft_ms`). + - TPOTs are sampled every 200th token and surfaced in the arrays like `tpot_observations_ms`. + +4. **Validate predictions in logs** + Tail EPP logs at verbosity `-v=4`. For each request you should see: + + - **Profile selection** + ``` + msg:"Running profile handler, Pick profiles" + plugin:"slo-aware-profile-handler/slo-aware-profile-handler" + ``` + + - **Candidate pods** + ``` + msg:"Before running scorer plugins" + pods:[{... "pod_name":"...-5k7qr" ...}, {... "pod_name":"...-9lp5g" ...}] + ``` + + - **SLO scorer pod scores** + ``` + msg:"Pod score" + scorer_type:"slo-scorer" + pod_name:"vllm-llama3-8b-instruct-7b584dd595-9b4wt" + score:0.82 + ``` + + - **Final pick** + ``` + msg:"Picked endpoint" + scorer_type:"slo-scorer" + selected_pod:"vllm-llama3-8b-instruct-7b584dd595-9b4wt" + ``` + + These logs confirm: + - The request entered the SLO-aware path. + - All candidate pods were evaluated. + - Scores reflect predicted headroom vs SLOs. + - The final pod was chosen based on SLO scorer output. + +5. **Confirm request shedding (optional)** + If you send requests with **priority < 0** and no pod can meet both TTFT & TPOT SLOs, logs should show the request being **shed** instead of placed in the negative bucket. + +--- + +## Configuration + +This section details the container setup, ConfigMaps, and profile configuration needed to enable prediction-based scheduling. + +### Sidecars & EPP containers in the Deployment + +**EPP container** +- **Image**: `epp` +- **Args** + - `--config-file=/config/default-plugins.yaml` + - `--enable-latency-predictor` +- **Env** + - `PREDICTION_SERVER_URL`: CSV of in-pod predictor endpoints + - `TRAINING_SERVER_URL`: `http://localhost:8000` + - `LATENCY_MAX_SAMPLE_SIZE` + - `NEG_HEADROOM_TTFT_WEIGHT`, `NEG_HEADROOM_TPOT_WEIGHT` + - `HEADROOM_TTFT_WEIGHT`, `HEADROOM_TPOT_WEIGHT` + - `HEADROOM_SELECTION_STRATEGY` + - `SLO_BUFFER_FACTOR` + +**Training sidecar (`training-server`)** +- **Port**: 8000 +- **EnvFrom**: `latency-predictor-config` +- **Volume**: `/models` + +**Prediction sidecars (`prediction-server-1/2/3`)** +- **Ports**: 8001, 8002, 8003 +- **EnvFrom**: `prediction-server-config` +- **Volumes**: `/server_models` + +--- + +### ConfigMaps + +**1. `latency-predictor-config` (training)** + +```yaml +data: + LATENCY_RETRAINING_INTERVAL_SEC: "1" + LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" + LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" + LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" + LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" + LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" + LATENCY_MODEL_TYPE: "xgboost" + LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" +``` + +**2. 
`prediction-server-config` (predictors)** + +```yaml +data: + LATENCY_MODEL_TYPE: "xgboost" + PREDICT_HOST: "0.0.0.0" + LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" + LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" + LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" + LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" +``` + +--- + +### Profiles & Plugins + +`plugins-config` ConfigMap (`default-plugins.yaml`): + +```yaml +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: prefix-cache-scorer + - type: slo-request-tracker + - type: slo-scorer + - type: slo-aware-profile-handler + - type: max-score-picker + +schedulingProfiles: + - name: default + plugins: + - pluginRef: slo-request-tracker + - pluginRef: prefix-cache-scorer + - pluginRef: queue-scorer + - pluginRef: kv-cache-utilization-scorer + - pluginRef: max-score-picker + + - name: slo + plugins: + - pluginRef: prefix-cache-scorer + weight: 0 + - pluginRef: slo-request-tracker + - pluginRef: slo-scorer + - pluginRef: max-score-picker +``` + +**What they do** +- `slo-request-tracker` — captures per-request SLOs and tracks them. +- `slo-scorer` — uses predicted TTFT/TPOT to compare against SLOs and classify into positive/negative buckets. +- `slo-aware-profile-handler` — switches requests into the `slo` profile when SLO headers are present. +- `queue-scorer`, `kv-cache-utilization-scorer`, `prefix-cache-scorer` — baseline scoring plugins. + +--- + +### Headroom strategies + +Tune positive vs negative headroom scoring with env vars: + +- `HEADROOM_SELECTION_STRATEGY` — `least` (compact) or `most` (spread) +- `HEADROOM_TTFT_WEIGHT` / `HEADROOM_TPOT_WEIGHT` — blend weights for positive headroom +- `NEG_HEADROOM_TTFT_WEIGHT` / `NEG_HEADROOM_TPOT_WEIGHT` — blend weights for deficits +- `SLO_BUFFER_FACTOR` — safety multiplier on TPOT SLOs + +--- + +### Enable prediction-based scheduling + +Turn on SLO-aware routing per request with the header: + +``` +x-prediction-based-scheduling: true +``` + +- If **SLO headers are present**: predictions are compared against thresholds. +- If **no SLOs** are provided: treated as SLO=0 → lowest latency pod is chosen. +- If **priority < 0** and **no pod can meet SLOs**: request is **shed** instead of placed in the negative bucket. + +**Current limitations** +- Percentile: only **p90** supported. +- Training: only **streaming mode** supported. +- TPOT sampling: for obsevability, every 200th token is logged and compared with predictions. + +--- + +## Cleanup + +To remove the resources you created in this walkthrough, follow the same cleanup instructions from the [Inference Gateway Extension guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/#cleanup). + +That section covers how to delete the InferencePool, ConfigMaps, and supporting resources you applied here. The steps are identical — only the EPP image and sidecar configuration differ. \ No newline at end of file diff --git a/k8s/guides/prereq/client-setup/README.md b/k8s/guides/prereq/client-setup/README.md new file mode 100644 index 000000000..93beae560 --- /dev/null +++ b/k8s/guides/prereq/client-setup/README.md @@ -0,0 +1,45 @@ +# Client Setup Prerequisites + +llm-d guides use a standard set of client tools on Linux and Mac OSX. The provided [install-deps.sh](./install-deps.sh) script will download and install the tools below. 
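+
+A typical invocation, run from this directory (add `--dev` if you also want the chart-testing tool described below):
+
+```bash
+# Install the required client tools; see --help for all options
+./install-deps.sh
+```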
+ +## Supported Development Platforms + +Currently llm-d community only supports OSX and Linux development. + +## Required Tools + +| Binary | Minimum Required Version | Download / Installation Instructions | +| ----------- | ------------------------ | ----------------------------------------------------------------------------------------------- | +| `yq` | v4+ | [yq (mikefarah) – installation](https://github.com/mikefarah/yq?tab=readme-ov-file#install) | +| `git` | v2.30.0+ | [git – installation guide](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) | +| `helm` | v3.12.0+ | [Helm – quick-start install](https://helm.sh/docs/intro/install/) | +| `helmfile` | v1.1.0+ | [Helmfile - installation](https://github.com/helmfile/helmfile?tab=readme-ov-file#installation) | +| `kubectl` | v1.28.0+ | [kubectl – install & setup](https://kubernetes.io/docs/tasks/tools/install-kubectl/) | + +### Optional Tools + +| Binary | Recommended Version | Download / Installation Instructions | +| ------------------ | ------------------------ | ------------------------------------------------------------------------------------------------ | +| `stern` | 1.30+ | [stern - installation](https://github.com/stern/stern?tab=readme-ov-file#installation) | +| `helm diff` plugin | v3.10.0+ | [helm diff installation docs](https://github.com/databus23/helm-diff?tab=readme-ov-file#install) | + +## HuggingFace Token + +Most guides download their model from Huggingface directly in the `llm-d` image. There are exceptions to this like the [`simulated-accelerators` guide](../../simulated-accelerators/) that uses no model, or the [`wide-ep-lws` guide](../../wide-ep-lws/) which uses a model loaded from storage directly on the nodes for faster development cycle iterations. + +For the rest you will need to create a Kubernetes secret in your deployment namespace containing your HuggingFace Token. For more information on getting a token, see [the huggingface docs](https://huggingface.co/docs/hub/en/security-tokens). + +The following script will create the token in the current namespace using the name `llm-d-hf-token`, which is used in all guides: + +```bash +export HF_TOKEN= +export HF_TOKEN_NAME=${HF_TOKEN_NAME:-llm-d-hf-token} +kubectl create secret generic ${HF_TOKEN_NAME} \ + --from-literal="HF_TOKEN=${HF_TOKEN}" \ + --namespace "${NAMESPACE}" \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +## Pulling llm-d Images from GitHub Container Registry (GHCR) + +All of the container images in the `llm-d` organization are public on GitHub and require no authentication to pull. diff --git a/k8s/guides/prereq/client-setup/install-deps.sh b/k8s/guides/prereq/client-setup/install-deps.sh new file mode 100755 index 000000000..0d79c2240 --- /dev/null +++ b/k8s/guides/prereq/client-setup/install-deps.sh @@ -0,0 +1,207 @@ +#!/usr/bin/env bash +# -*- indent-tabs-mode: nil; tab-width: 4; sh-indentation: 4; -*- + +set -euo pipefail + +######################################## +# Component versions +######################################## +# Helm version +HELM_VER="v3.17.3" +# Helmdiff version +HELMDIFF_VERSION="v3.11.0" +# Helmfile version +HELMFILE_VERSION="1.1.3" +# chart-testing version +CT_VERSION="3.12.0" + +######################################## +# Usage function +######################################## +show_usage() { + cat << EOF +Usage: $0 [OPTIONS] + +Install essential tools for llm-d deployment. 
+ +OPTIONS: + --dev Install additional development tools (chart-testing) + -h, --help Show this help message and exit + +EXAMPLES: + $0 Install basic tools only + $0 --dev Install basic tools + development tools + $0 --help Show this help message + +TOOLS INSTALLED: + Basic tools: + - git, curl, tar (system packages) + - yq (YAML processor) + - kubectl (Kubernetes CLI) + - helm (Helm package manager) + - helm diff plugin (optional but highly recommended) + - helmfile (Helm deployment tool) + + Development tools (with --dev): + - chart-testing (Helm chart testing tool) + +EOF +} + +######################################## +# Parse command line arguments +######################################## +DEV_MODE=false +for arg in "$@"; do + case $arg in + --dev) + DEV_MODE=true + ;; + -h|--help) + show_usage + exit 0 + ;; + *) + echo "Unknown option: $arg" + echo "Use --help for usage information." + exit 1 + ;; + esac +done + +######################################## +# Helper: detect current OS / ARCH +######################################## +OS=$(uname | tr '[:upper:]' '[:lower:]') +ARCH=$(uname -m) +case "$ARCH" in + arm64|aarch64) ARCH="arm64" ;; + x86_64) ARCH="amd64" ;; + *) echo "Unsupported architecture: $ARCH"; exit 1 ;; +esac + +######################################## +# Helper: install a package via the +# best available package manager +######################################## +install_pkg() { + PKG="$1" + if [[ "$OS" == "linux" ]]; then + if command -v apt &> /dev/null; then + sudo apt-get install -y "$PKG" + elif command -v dnf &> /dev/null; then + sudo dnf install -y "$PKG" + elif command -v yum &> /dev/null; then + sudo yum install -y "$PKG" + else + echo "Unsupported Linux distro (no apt, dnf, or yum)."; + exit 1 + fi + elif [[ "$OS" == "darwin" ]]; then + if command -v brew &> /dev/null; then + brew install "$PKG" + else + echo "Homebrew not found. Please install Homebrew or add manual install logic."; + exit 1 + fi + else + echo "Unsupported OS: $OS"; + exit 1 + fi +} + +######################################## +# Base utilities +######################################## +for pkg in git curl tar; do + if ! command -v "$pkg" &> /dev/null; then + install_pkg "$pkg" + fi +done + +######################################## +# yq (v4+) +######################################## +if ! command -v yq &> /dev/null; then + echo "Installing yq..." + curl -sLo yq \ + "https://github.com/mikefarah/yq/releases/latest/download/yq_${OS}_${ARCH}" + chmod +x yq + sudo mv yq /usr/local/bin/yq +fi + +if ! yq --version 2>&1 | grep -q 'mikefarah'; then + echo "Detected yq is not mikefarah’s yq. Please uninstall your current yq and re-run this script." + exit 1 +fi +######################################## +# kubectl +######################################## +if ! command -v kubectl &> /dev/null; then + echo "Installing kubectl..." + K8S_URL="https://dl.k8s.io/release/$(curl -sL https://dl.k8s.io/release/stable.txt)" + curl -sLO "${K8S_URL}/bin/${OS}/${ARCH}/kubectl" + if [[ "$OS" == "darwin" ]]; then + sudo install -m 0755 kubectl /usr/local/bin/kubectl + else + sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + fi + rm kubectl +fi + +######################################## +# Helm +######################################## +if ! command -v helm &> /dev/null; then + echo "Installing Helm..." 
+ TARBALL="helm-${HELM_VER}-${OS}-${ARCH}.tar.gz" + curl -sLO "https://get.helm.sh/${TARBALL}" + tar -zxvf "${TARBALL}" + sudo mv "${OS}-${ARCH}/helm" /usr/local/bin/helm + rm -rf "${OS}-${ARCH}" "${TARBALL}" +fi + +######################################## +# Helm diff plugin +######################################## +if ! helm plugin list | grep -q diff; then + echo "📦 helm-diff plugin not found. Installing ${HELMDIFF_VERSION}..." + helm plugin install --version "${HELMDIFF_VERSION}" https://github.com/databus23/helm-diff +fi + +######################################## +# helmfile +######################################## +if ! command -v helmfile &> /dev/null; then + echo "📦 helmfile not found. Installing ${HELMFILE_VERSION}..." + if [[ "$OS" == "darwin" && "$ARCH" == "arm64" ]]; then + ARCHIVE="helmfile_${HELMFILE_VERSION}_darwin_arm64.tar.gz" + else + ARCHIVE="helmfile_${HELMFILE_VERSION}_${OS}_${ARCH}.tar.gz" + fi + + URL="https://github.com/helmfile/helmfile/releases/download/v${HELMFILE_VERSION}/${ARCHIVE}" + curl -sSL -o "/tmp/helmfile.tar.gz" "$URL" + tar -xzf /tmp/helmfile.tar.gz -C /tmp + sudo mv /tmp/helmfile /usr/local/bin/helmfile + sudo chmod +x /usr/local/bin/helmfile + rm /tmp/helmfile.tar.gz +fi + +######################################## +# chart-testing (dev mode only) +######################################## +if [[ "$DEV_MODE" == true ]]; then + if ! command -v ct &> /dev/null; then + echo "Installing chart-testing (ct)..." + ARCHIVE="chart-testing_${CT_VERSION}_${OS}_${ARCH}.tar.gz" + URL="https://github.com/helm/chart-testing/releases/download/v${CT_VERSION}/${ARCHIVE}" + curl -sSL -o "/tmp/ct.tar.gz" "$URL" + tar -xzf /tmp/ct.tar.gz -C /tmp + sudo mv /tmp/ct /usr/local/bin/ct + sudo chmod +x /usr/local/bin/ct + rm /tmp/ct.tar.gz + fi +fi + +echo "✅ All tools installed successfully." diff --git a/k8s/guides/prereq/gateway-provider/README.md b/k8s/guides/prereq/gateway-provider/README.md new file mode 100644 index 000000000..4836b9fa8 --- /dev/null +++ b/k8s/guides/prereq/gateway-provider/README.md @@ -0,0 +1,125 @@ +# Gateway Provider Prerequisite + +This document will guide you through configuring a [Kubernetes Gateway](https://gateway-api.sigs.k8s.io/) provider that can support the llm-d [`inference-scheduler`](https://github.com/llm-d/llm-d-inference-scheduler) component. + +The key elements are: + +* The `inference-scheduler` is an **endpoint picker (EPP)** that decides which model server a given request should go to +* The **Inference Gateway `InferencePool` Custom Resource** that provisions and configures an `inference-scheduler` on a Kubernetes cluster +* The **Gateway Custom Resources** that define the Kubernetes-native Gateway API and how traffic reaches an `InferencePool` +* A **compatible Gateway implementation (control plane)** that provisions and configures load balancers and endpoint pickers in response to the Gateway API and InferencePool API + +After this prerequisite is complete you will be able to create `InferencePool` objects on cluster and route traffic to them. + +This prerequisite generally requires cluster administration rights. + +## Why do you need a Gateway? + +The inference scheduler provides an extension to [compatible Gateway providers](https://gateway-api-inference-extension.sigs.k8s.io/implementations/gateways/) that optimizes load balancing of LLM traffic across model server replicas. 
+
+The integration with a Gateway allows self-hosted models to be exposed in a [wide variety of network topologies including](https://gateway-api.sigs.k8s.io/concepts/use-cases/):
+
+* Internet-facing services
+* Internal to your cluster
+* Through a service mesh
+
+and take advantage of key Gateway features like:
+
+* Traffic splitting for incremental rollout of new models
+* TLS encryption of queries and responses
+
+By integrating with a Gateway, instead of developing an llm-d-specific proxy layer, llm-d can leverage the high performance of mature proxies like [Envoy](https://www.envoyproxy.io/) and take advantage of existing operational tools for managing traffic to services.
+
+## Select and install an `inference-scheduler` compatible Gateway implementation
+
+llm-d requires that you select a [Gateway implementation that supports the inference-scheduler](https://gateway-api-inference-extension.sigs.k8s.io/implementations/gateways/). Your infrastructure may provide a default compatible implementation, or you may choose to deploy a gateway implementation onto your cluster.
+
+### Use an infrastructure provided Gateway implementation
+
+We recommend using the infrastructure-provided Gateway with our guides if available.
+
+#### Google Kubernetes Engine (GKE)
+
+GKE automatically enables an inference-compatible Gateway control plane when you enable the `HttpLoadBalancing` addon.
+
+The key choice for deployment is whether you want to create a regional internal Application Load Balancer (class name: `gke-l7-rilb`), accessible only to workloads within your VPC, or a regional external Application Load Balancer (class name: `gke-l7-regional-external-managed`), accessible from the internet.
+
+The following steps from the [GKE Inference Gateway deployment documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/deploy-gke-inference-gateway) should be run:
+
+1. [Verify your prerequisites](https://cloud.google.com/kubernetes-engine/docs/how-to/deploy-gke-inference-gateway#before-you-begin)
+2. [Prepare your environment](https://cloud.google.com/kubernetes-engine/docs/how-to/deploy-gke-inference-gateway#prepare-environment)
+3. [Create the Gateway](https://cloud.google.com/kubernetes-engine/docs/how-to/deploy-gke-inference-gateway#create-gateway)
+
+The other steps are optional and are not necessary to continue with your guide.
+
+### Self-installed Gateway implementations
+
+llm-d provides a Helm chart that installs and configures the `kgateway` or `istio` Gateway implementations.
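+
+At a glance, the self-install flow is to install the required CRDs first and then apply the control-plane helmfile for your chosen provider. A condensed sketch of the steps detailed in the sections below:
+
+```bash
+# Condensed self-install flow (both steps are explained below)
+./install-gateway-provider-dependencies.sh      # Gateway API + Inference Extension CRDs
+helmfile apply -f istio.helmfile.yaml           # or: kgateway.helmfile.yaml
+```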
+ +#### Before you begin + +Prior to deploying a Gateway control plane, you must install the custom resource definitions (CRDs) configuration that adds the Kubernetes API objects: + +- [Gateway API v1.3.0 CRDs](https://github.com/kubernetes-sigs/gateway-api/tree/v1.3.0/config/crd) + - for more information see their [docs](https://gateway-api.sigs.k8s.io/guides/) +- [Gateway API Inference Extension CRDs v1.0.1](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/v1.0.1/config/crd) + - for more information see their [docs](https://gateway-api-inference-extension.sigs.k8s.io/) + +We have provided the [`install-gateway-provider-dependencies.sh`](./install-gateway-provider-dependencies.sh) script: + +```bash +./install-gateway-provider-dependencies.sh +``` + +To remove the created dependencies: + +```bash +./install-gateway-provider-dependencies.sh delete +``` + +You may specify any valid git source control reference for versions as `GATEWAY_API_CRD_REVISION` and `GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION`: + +```bash +export GATEWAY_API_CRD_REVISION="v1.2.0" +export GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION="v0.5.0" +./install-gateway-provider-dependencies.sh +``` + +#### Installation + +To install the gateway control plane: + +```bash +helmfile apply -f .helmfile.yaml # options: [`istio`, `kgateway`] +# ex: helmfile apply -f istio.helmfile.yaml +``` + +#### Targeted install + +If the CRDs already exist in your cluster and you do not wish to re-apply them, use the `--selector kind=gateway-control-plane` selector to limit your changes to the infrastructure: + +```bash +# Install +helmfile apply -f --selector kind=gateway-control-plane +# Uninstall +helmfile destroy -f --selector kind=gateway-control-plane +``` + +If you wish to bump versions or customize your installs, check out our helmfiles for [istio](./istio.helmfile.yaml), and [kgateway](./kgateway.helmfile.yaml) respectively. + +### Other Gateway implementations + +For other [compatible Gateway implementations](https://gateway-api-inference-extension.sigs.k8s.io/implementations/gateways/) follow the instructions for your selected Gateway. Ensure the necessary CRDs for Gateway API and the Gateway API Inference Extension are installed. + +## Verify your installation + +Once the prerequisite steps are complete, you should be able to verify that `InferencePool` is installed on your cluster with: + +```bash +# Verify the v1 APIs are installed, specifically InferencePool +kubectl api-resources --api-group=inference.networking.k8s.io +# Verify other APIs are installed +kubectl api-resources --api-group=inference.networking.x-k8s.io +``` + +If successful, the first command should return at least the `v1` version of `InferencePool`, and you should also see a `v1alpha2` or newer version of `InferenceObjective`. diff --git a/k8s/guides/prereq/gateway-provider/common-configurations/README.md b/k8s/guides/prereq/gateway-provider/common-configurations/README.md new file mode 100644 index 000000000..a39ddee04 --- /dev/null +++ b/k8s/guides/prereq/gateway-provider/common-configurations/README.md @@ -0,0 +1,3 @@ +# Gateway Provider Common Configurations + +Each guide pulls in these gateway configurations. They are meant to abstract all the basic values that get set if you are using a gateway of a certain type. Performance related configurations should live in [benchmarking.yaml](./benchmarking.yaml), and can then be combined with environment specific settings to create a benchmarking setup for that given environment. 
For an example of that see the [inference-scheduling helmfile](../../../inference-scheduling/helmfile.yaml.gotmpl). diff --git a/k8s/guides/prereq/gateway-provider/common-configurations/benchmarking.yaml b/k8s/guides/prereq/gateway-provider/common-configurations/benchmarking.yaml new file mode 100644 index 000000000..8b55d2026 --- /dev/null +++ b/k8s/guides/prereq/gateway-provider/common-configurations/benchmarking.yaml @@ -0,0 +1,42 @@ +# Infra values +gateway: + gatewayParameters: + accessLogging: false + logLevel: error + resources: + limits: + cpu: "4" + memory: 4Gi + requests: + cpu: "1" + memory: 1Gi + +# MS values +routing: + proxy: + debugLevel: 1 + # IF epp is created through ms, set its debugLevel + epp: + debugLevel: 1 + +# GAIE values +inferenceExtension: + flags: + - name: v + value: 1 + # include pool group, whole flags get overwritten + - name: "pool-group" + value: "inference.networking.x-k8s.io" +istio: + destinationRule: + trafficPolicy: + connectionPool: + http: + http1MaxPendingRequests: 256000 + maxRequestsPerConnection: 256000 + http2MaxRequests: 256000 + idleTimeout: "900s" + tcp: + maxConnections: 256000 + maxConnectionDuration: "1800s" + connectTimeout: "900s" diff --git a/k8s/guides/prereq/gateway-provider/common-configurations/gke.yaml b/k8s/guides/prereq/gateway-provider/common-configurations/gke.yaml new file mode 100644 index 000000000..62cf5e962 --- /dev/null +++ b/k8s/guides/prereq/gateway-provider/common-configurations/gke.yaml @@ -0,0 +1,10 @@ +gateway: + gatewayClassName: gke-l7-regional-external-managed + gatewayParameters: + accessLogging: false + +provider: + name: gke + +inferencePool: + apiVersion: "inference.networking.k8s.io/v1" diff --git a/k8s/guides/prereq/gateway-provider/common-configurations/gke_tpu.yaml b/k8s/guides/prereq/gateway-provider/common-configurations/gke_tpu.yaml new file mode 100644 index 000000000..62cf5e962 --- /dev/null +++ b/k8s/guides/prereq/gateway-provider/common-configurations/gke_tpu.yaml @@ -0,0 +1,10 @@ +gateway: + gatewayClassName: gke-l7-regional-external-managed + gatewayParameters: + accessLogging: false + +provider: + name: gke + +inferencePool: + apiVersion: "inference.networking.k8s.io/v1" diff --git a/k8s/guides/prereq/gateway-provider/common-configurations/istio.yaml b/k8s/guides/prereq/gateway-provider/common-configurations/istio.yaml new file mode 100644 index 000000000..431a0795a --- /dev/null +++ b/k8s/guides/prereq/gateway-provider/common-configurations/istio.yaml @@ -0,0 +1,21 @@ +# Infra values +gateway: + gatewayClassName: istio + +# GAIE values +inferenceExtension: + flags: # empty flag overrides, so key always exists to pull values from, its just empty +provider: + name: istio +istio: + destinationRule: + trafficPolicy: + tls: + mode: SIMPLE + insecureSkipVerify: true + +routing: # empty debugLevel overrides, so keys always exist to pull values from, its just empty + proxy: + debugLevel: + epp: + debugLevel: diff --git a/k8s/guides/prereq/gateway-provider/common-configurations/kgateway.yaml b/k8s/guides/prereq/gateway-provider/common-configurations/kgateway.yaml new file mode 100644 index 000000000..48878fbd0 --- /dev/null +++ b/k8s/guides/prereq/gateway-provider/common-configurations/kgateway.yaml @@ -0,0 +1,2 @@ +gateway: + gatewayClassName: kgateway diff --git a/k8s/guides/prereq/gateway-provider/install-gateway-provider-dependencies.sh b/k8s/guides/prereq/gateway-provider/install-gateway-provider-dependencies.sh new file mode 100755 index 000000000..36fde3212 --- /dev/null +++ 
b/k8s/guides/prereq/gateway-provider/install-gateway-provider-dependencies.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# -*- indent-tabs-mode: nil; tab-width: 2; sh-indentation: 2; -*- + +# This is a script to automate installation and removal of the Gateway API and Gateway API Inference Extension CRDs + +set +x +set -e +set -o pipefail + +if [ -z "$(command -v kubectl)" ]; then + echo "This script depends on \`kubectl\`. Please install it." + exit 1 +fi + +# Logging functions and ASCII colour helpers. +COLOR_RESET=$'\e[0m' +COLOR_GREEN=$'\e[32m' +COLOR_RED=$'\e[31m' + +log_success() { + echo "${COLOR_GREEN}✅ $*${COLOR_RESET}" +} + +log_error() { + echo "${COLOR_RED}❌ $*${COLOR_RESET}" >&2 +} + +## Populate manifests +MODE=${1:-apply} # allowed values "apply" or "delete" +if [[ "$MODE" == "apply" ]]; then + LOG_ACTION_NAME="Installing" +elif [[ "$MODE" == "delete" ]]; then + LOG_ACTION_NAME="Deleting" +else + log_error "Unrecognized Mode: ${MODE}, only supports \`apply\` or \`delete\`." + exit 1 +fi + +GATEWAY_API_CRD_REVISION=${GATEWAY_API_CRD_REVISION:-"v1.3.0"} +GATEWAY_API_CRD_REF="?ref=${GATEWAY_API_CRD_REVISION}" +### Base CRDs +log_success "📜 Base CRDs: ${LOG_ACTION_NAME}..." +kubectl $MODE -k https://github.com/kubernetes-sigs/gateway-api/config/crd/${GATEWAY_API_CRD_REF} || true + + +GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION=${GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION:-"v1.0.1"} +GATEWAY_API_INFERENCE_EXTENSION_CRD_REF="?ref=${GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION}" +### GAIE CRDs +log_success "🚪 GAIE CRDs: ${LOG_ACTION_NAME}..." +kubectl $MODE -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd/${GATEWAY_API_INFERENCE_EXTENSION_CRD_REF} || true diff --git a/k8s/guides/prereq/gateway-provider/istio.helmfile.yaml b/k8s/guides/prereq/gateway-provider/istio.helmfile.yaml new file mode 100644 index 000000000..8501901f9 --- /dev/null +++ b/k8s/guides/prereq/gateway-provider/istio.helmfile.yaml @@ -0,0 +1,30 @@ +releases: + - name: istio-base + chart: oci://gcr.io/istio-testing/charts/base + version: 1.28-alpha.89f30b26ba71bf5e538083a4720d0bc2d8c06401 + namespace: istio-system + installed: true + labels: + type: gateway-provider + kind: gateway-crds + + - name: istiod + chart: oci://gcr.io/istio-testing/charts/istiod + version: 1.28-alpha.89f30b26ba71bf5e538083a4720d0bc2d8c06401 + namespace: istio-system + installed: true + needs: + - istio-system/istio-base + values: + - meshConfig: + defaultConfig: + proxyMetadata: + SUPPORT_GATEWAY_API_INFERENCE_EXTENSION: "true" + pilot: + env: + SUPPORT_GATEWAY_API_INFERENCE_EXTENSION: "true" + tag: 1.28-alpha.89f30b26ba71bf5e538083a4720d0bc2d8c06401 + hub: "gcr.io/istio-testing" + labels: + type: gateway-provider + kind: gateway-control-plane diff --git a/k8s/guides/prereq/gateway-provider/kgateway.helmfile.yaml b/k8s/guides/prereq/gateway-provider/kgateway.helmfile.yaml new file mode 100644 index 000000000..bbf3935e7 --- /dev/null +++ b/k8s/guides/prereq/gateway-provider/kgateway.helmfile.yaml @@ -0,0 +1,31 @@ +releases: + - name: kgateway-crds + chart: oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds + namespace: kgateway-system + version: v2.0.4 + installed: true + labels: + type: gateway-provider + kind: gateway-crds + + - name: kgateway + chart: oci://cr.kgateway.dev/kgateway-dev/charts/kgateway + version: v2.0.4 + namespace: kgateway-system + installed: true + needs: + - kgateway-system/kgateway-crds + values: + - inferenceExtension: + enabled: true + securityContext: + allowPrivilegeEscalation: false + 
capabilities: + drop: ["ALL"] + podSecurityContext: + seccompProfile: + type: "RuntimeDefault" + runAsNonRoot: true + labels: + type: gateway-provider + kind: gateway-control-plane diff --git a/k8s/guides/prereq/infrastructure/README.md b/k8s/guides/prereq/infrastructure/README.md new file mode 100644 index 000000000..551b4fab9 --- /dev/null +++ b/k8s/guides/prereq/infrastructure/README.md @@ -0,0 +1,90 @@ +# Infrastructure Prerequisite + +This document will guide you through choosing the Kubernetes infrastructure to run llm-d. It covers both the fundamental hardware and software requirements for llm-d, cluster configuration, as well as configuration specific to infrastructure providers (clouds, specific distributions). + +## llm-d infrastructure + +llm-d tests on the following configurations, supporting leading-edge AI accelerators: + +* Kubernetes: 1.29 or newer + * Your cluster scheduler must support placing multiple pods within the same networking domain for running multi-host inference +* Recent generation datacenter-class accelerators + * AMD MI250X or newer + * Google TPU v5e, v6e, and newer + * NVIDIA L4, A100, H100, H200, B200, and newer +* Fast internode networking + * For accelerators + * AMD Infinity Fabric, InfiniBand NICs + * Google TPU ICI + * NVIDIA NVLink, InfiniBand or RoCE NICs + * For hosts and north/south traffic + * Fast (100Gbps+ aggregate throughput) datacenter NICs +* Hosts + * 80+ x86 or ARM cores per machine + * 500GiB or more of memory + * PCIe 5+ + +Older configurations may function, especially slightly older accelerators, but testing is best-effort. + +### (Optional) vLLM container image + +llm-d provides container images derived from the [vLLM upstream](https://github.com/vllm-project/vllm/tree/main/docker) that are tested with the supported hardware and have all necessary optimized libraries installed. To build and deploy with your own image, you should integrate: + +* General + * vLLM: 0.10.0 or newer + * NIXL: 0.5.0 or newer + * UCX: 0.19.0 or newer +* NVIDIA-specific + * NVSHMEM: 3.3.9 or newer + +llm-d guides expect a series of conventions to be followed in the vLLM image: + +* General + * At least one vLLM compatible Python version must be available (3.9 to 3.12) + * We recommend at least 3.10+ + * Required system libraries must be bundled + * `LD_LIBRARY_PATH` must contain all necessary system libraries for vLLM to function + * `PATH` must contain the vLLM binary and directly invoking `vllm` should start with the correct Python environment (i.e. a virtual env) + * The default image command (or if not specified, entrypoint) should start vLLM in a serving configuration and accept additional arguments + * A pod with `args` should see all arguments passed to vLLM + * A pod with `command: ["vllm", "serve"]` should override any image defaults +* Caches + * Default compilation cache directory environment variables under a shared root path under `/tmp/cache/compile/` + * I.e. set `VLLM_CACHE_ROOT=/tmp/cache/compile/vllm` to ensure vLLM compiles to a temporary directory + * Future versions of vLLM will recommend mounting a pod volume to `/tmp/cache` to mitigate restart for some caches. 
+ * Do not hardcode the model cache directory and model cache environment variables + * Future versions of llm-d will provide conventions for vLLM model loading +* Hardware + * Follow best practices for your hardware ecosystem, including: + * Expecting to mount hardware-specific drivers and libraries from a standard host location as a value + * Ahead Of Time (AOT) compilation of kernels + * NVIDIA specific + * `LD_LIBRARY_PATH` includes the `/usr/local/nvidia/lib64` directory to allow Kubernetes GPU operators to inject the appropriate driver + +### (Optional) Install LeaderWorkerSet for multi-host inference + +The LeaderWorkerSet (LWS) Kubernetes workload controller specializes in deploying serving workloads where each replica is composed of multiple pods spread across hosts, specifically accelerator nodes. llm-d defaults to LWS for deployment of multi-host inference for rank to pod mappings, topology aware placement to ensure optimal accelerator network performance, and all-or-nothing failure and restart semantics to recover in the event of a bad node or accelerator. + +Use the [LWS installation guide](https://lws.sigs.k8s.io/docs/installation/) to install 0.7.0 or newer when deploying an llm-d guide using LWS. + +## Installing on a well-lit infrastructure provider + +The following documentation describes llm-d tested setup for cluster infrastructure providers as well as specific deployment settings that will impact how model servers is expected to access accelerators. + +* [DigitalOcean Kubernetes (DOKS)](../../../docs/infra-providers/digitalocean/README.md) +* [Google Kubernetes Engine (GKE)](../../../docs/infra-providers/gke/README.md) +* [OpenShift (OCP)](../../../docs/infra-providers/openshift/README.md), [OpenShift on AWS](../../../docs/infra-providers/openshift-aws/README.md) +* [minikube](../../../docs/infra-providers/minikube/README.md) for single-host development + +These provider configurations are tested regularly. + +Please follow the provider-specific documentation to ensure your Kubernetes cluster and hardware is properly configured before continuing. + +## Other providers + +To add a new infrastructure provider to our well-lit paths, we request the following support: + +* Documentation on configuring the platform to support one or more [well-lit path guides](../../README.md#well-lit-path-guides) +* The appropriate configuration contributed to the guide to deal with provider specific variation +* An automated test environment that validates the supported guides +* At least one documented platform maintainer who responds to GitHub issues and is available for regular discussion in the llm-d slack channel `#sig-installation`. \ No newline at end of file diff --git a/k8s/guides/simulated-accelerators/README.md b/k8s/guides/simulated-accelerators/README.md new file mode 100644 index 000000000..8ccbd4fae --- /dev/null +++ b/k8s/guides/simulated-accelerators/README.md @@ -0,0 +1,125 @@ +# Feature: llm-d Accelerator Simulation + +## Overview + +Conducting large scale testing of AI/ML workloads is difficult when capacity is limited or already committed to production workloads. `llm-d` provides a lightweight model server that mimics the behavior of executing inference without requiring an attached accelerator. This simulated server can be run in wide or dense configurations on CPU-only machines to validate the correct behavior of other parts of the system, including Kubernetes autoscaling and the `inference-scheduler`. 
+
+This guide demonstrates how to deploy the simulator `ghcr.io/llm-d/llm-d-inference-sim` image and generate inference responses.
+
+## Prerequisites
+
+- Have the [proper client tools installed on your local system](../prereq/client-setup/README.md) to use this guide.
+- Configure and deploy your [Gateway control plane](../prereq/gateway-provider/README.md).
+- Have the [Monitoring stack](../../docs/monitoring/README.md) installed on your system.
+
+**_NOTE:_** Unlike other examples, which require models, the simulator stubs the vLLM server, so no HuggingFace token is needed.
+
+## Installation
+
+Use the helmfile to compose and install the stack. The namespace in which the stack is deployed is derived from the `${NAMESPACE}` environment variable. If you have not set this, it will default to `llm-d-sim` in this example.
+
+```bash
+export NAMESPACE=llm-d-sim # Or any namespace your heart desires
+cd guides/simulated-accelerators
+helmfile apply -n ${NAMESPACE}
+```
+
+**_NOTE:_** You can set the `$RELEASE_NAME_POSTFIX` env variable to change the release names. This is how we support concurrent installs, ex: `RELEASE_NAME_POSTFIX=sim-2 helmfile apply -n ${NAMESPACE}`
+
+**_NOTE:_** This uses Istio as the default provider; see [Gateway Options](./README.md#gateway-options) for installing with a specific provider.
+
+### Gateway options
+
+To specify your gateway choice, use the `-e <gateway>` flag, ex:
+
+```bash
+helmfile apply -e kgateway -n ${NAMESPACE}
+```
+
+To see which gateway options are supported, refer to our [gateway provider prereq doc](../prereq/gateway-provider/README.md#supported-providers). Gateway configurations per provider are tracked in the [gateway-configurations directory](../prereq/gateway-provider/common-configurations/).
+
+You can also customize your gateway; for more information on how to do that, see our [gateway customization docs](../../docs/customizing-your-gateway.md).
+
+### Install HTTPRoute
+
+Follow the provider-specific instructions below for installing the HTTPRoute.
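+
+If you are unsure which variant applies to your cluster, inspecting the deployed Gateway's class is a quick way to tell. This is only a convenience sketch using standard `kubectl` output options:
+
+```bash
+# Print each Gateway in the namespace alongside its gatewayClassName
+kubectl get gateway -n ${NAMESPACE} \
+  -o custom-columns=NAME:.metadata.name,CLASS:.spec.gatewayClassName
+```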
+ +#### Install for "kgateway" or "istio" + +```bash +kubectl apply -f httproute.yaml -n ${NAMESPACE} +``` + +#### Install for "gke" + +```bash +kubectl apply -f httproute.gke.yaml -n ${NAMESPACE} +``` + +## Verify the Installation + +- Firstly, you should be able to list all helm releases to view the 3 charts got installed into your chosen namespace: + +```bash +helm list -n ${NAMESPACE} +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +gaie-sim llm-d-sim 1 2025-08-24 11:44:26.88254 -0700 PDT deployed inferencepool-v1.0.1 v1.0.1 +infra-sim llm-d-sim 1 2025-08-24 11:44:23.11688 -0700 PDT deployed llm-d-infra-v1.3.3 v0.3.0 +ms-sim llm-d-sim 1 2025-08-24 11:44:32.17112 -0700 PDT deployed llm-d-modelservice-v0.2.9 v0.2.0 +``` + +- Out of the box with this example you should have the following resources: + +```bash +kubectl get all -n ${NAMESPACE} +NAME READY STATUS RESTARTS AGE +pod/gaie-sim-epp-694bdbd44c-4sh92 1/1 Running 0 7m14s +pod/infra-sim-inference-gateway-istio-68d59c4778-n6n5l 1/1 Running 0 7m19s +pod/ms-sim-llm-d-modelservice-decode-674774f45d-hhlxl 2/2 Running 0 7m10s +pod/ms-sim-llm-d-modelservice-decode-674774f45d-p5lsx 2/2 Running 0 7m10s +pod/ms-sim-llm-d-modelservice-decode-674774f45d-zpp84 2/2 Running 0 7m10s +pod/ms-sim-llm-d-modelservice-prefill-76c86dd9f8-pvbzm 1/1 Running 0 7m10s + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/gaie-sim-epp ClusterIP 10.16.0.143 9002/TCP,9090/TCP 7m14s +service/gaie-sim-ip-207d1d4c ClusterIP None 54321/TCP 7m14s +service/infra-sim-inference-gateway-istio LoadBalancer 10.16.1.112 10.16.4.2 15021:33302/TCP,80:31413/TCP 7m19s + +NAME READY UP-TO-DATE AVAILABLE AGE +deployment.apps/gaie-sim-epp 1/1 1 1 7m14s +deployment.apps/infra-sim-inference-gateway-istio 1/1 1 1 7m19s +deployment.apps/ms-sim-llm-d-modelservice-decode 3/3 3 3 7m10s +deployment.apps/ms-sim-llm-d-modelservice-prefill 1/1 1 1 7m10s + +NAME DESIRED CURRENT READY AGE +replicaset.apps/gaie-sim-epp-694bdbd44c 1 1 1 7m15s +replicaset.apps/infra-sim-inference-gateway-istio-68d59c4778 1 1 1 7m20s +replicaset.apps/ms-sim-llm-d-modelservice-decode-674774f45d 3 3 3 7m11s +replicaset.apps/ms-sim-llm-d-modelservice-prefill-76c86dd9f8 1 1 1 7m11s +``` + +**_NOTE:_** This assumes no other guide deployments in your given `${NAMESPACE}`. + +## Using the stack + +For instructions on getting started making inference requests see [our docs](../../docs/getting-started-inferencing.md) + +## Cleanup + +To remove the deployment: + +```bash +# From examples/sim +helmfile destroy -n ${NAMESPACE} + +# Or uninstall manually +helm uninstall infra-sim -n ${NAMESPACE} +helm uninstall gaie-sim -n ${NAMESPACE} +helm uninstall ms-sim -n ${NAMESPACE} +``` + +**_NOTE:_** If you set the `$RELEASE_NAME_POSTFIX` environment variable, your release names will be different from the command above: `infra-$RELEASE_NAME_POSTFIX`, `gaie-$RELEASE_NAME_POSTFIX` and `ms-$RELEASE_NAME_POSTFIX`. 
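+
+To confirm the teardown completed, it is enough to list what remains in the namespace. A quick sketch using standard `helm` and `kubectl` commands:
+
+```bash
+# After `helmfile destroy`, these should report no releases and no stack workloads
+helm list -n ${NAMESPACE}
+kubectl get pods,services,httproutes -n ${NAMESPACE}
+```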
+ +## Customization + +For information on customizing a guide and tips to build your own, see [our docs](../../docs/customizing-a-guide.md) diff --git a/k8s/guides/simulated-accelerators/gaie-sim/values.yaml b/k8s/guides/simulated-accelerators/gaie-sim/values.yaml new file mode 100644 index 000000000..4c11e3f9b --- /dev/null +++ b/k8s/guides/simulated-accelerators/gaie-sim/values.yaml @@ -0,0 +1,33 @@ +inferenceExtension: + replicas: 1 + image: + # both downstream infernece-scheduler and upstream epp image can support simulated-accelerators example + ################### + name: llm-d-inference-scheduler + hub: ghcr.io/llm-d + tag: v0.3.2 + ################### + # name: epp + # hub: registry.k8s.io/gateway-api-inference-extension + # tag: v1.0.1 + ################### + pullPolicy: Always + extProcPort: 9002 + pluginsConfigFile: "default-plugins.yaml" # using upstream GIE default-plugins, see: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/charts/inferencepool/templates/epp-config.yaml#L7C3-L56C33 + + # Monitoring configuration for EPP + monitoring: + interval: "10s" + # Service account token secret for authentication + secret: + name: sim-gateway-sa-metrics-reader-secret + # Prometheus ServiceMonitor will be created when enabled for EPP metrics collection + prometheus: + enabled: true +inferencePool: + targetPortNumber: 8000 + apiVersion: inference.networking.x-k8s.io/v1alpha2 # use old API version for inference + modelServerType: vllm + modelServers: + matchLabels: + llm-d.ai/inferenceServing: "true" diff --git a/k8s/guides/simulated-accelerators/helmfile.yaml.gotmpl b/k8s/guides/simulated-accelerators/helmfile.yaml.gotmpl new file mode 100644 index 000000000..611fc1c9e --- /dev/null +++ b/k8s/guides/simulated-accelerators/helmfile.yaml.gotmpl @@ -0,0 +1,91 @@ +environments: + istio: &I + values: + - ../prereq/gateway-provider/common-configurations/istio.yaml + kgateway: &KG + values: + - ../prereq/gateway-provider/common-configurations/kgateway.yaml + gke: &GKE + values: + - ../prereq/gateway-provider/common-configurations/gke.yaml + default: + <<: *I + +--- + +{{- $ns := .Namespace | default "llm-d-sim" -}} +{{- $rn := (env "RELEASE_NAME_POSTFIX") | default "sim" -}} + +repositories: + - name: llm-d-modelservice + url: https://llm-d-incubation.github.io/llm-d-modelservice/ + - name: llm-d-infra + url: https://llm-d-incubation.github.io/llm-d-infra/ + +releases: + - name: {{ printf "infra-%s" $rn | quote }} + namespace: {{ $ns }} + chart: llm-d-infra/llm-d-infra + version: v1.3.3 + installed: true + labels: + type: infrastructure + kind: inference-stack + values: + - gateway: + {{ .Environment.Values.gateway | toYaml | nindent 10 }} + + - name: {{ printf "gaie-%s" $rn | quote }} + namespace: {{ $ns }} + chart: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + version: v1.0.1 + installed: true + needs: + - {{ printf "infra-%s" $rn | quote }} + values: + - gaie-sim/values.yaml + {{- if eq .Environment.Name "gke" }} + - provider: + name: {{ .Environment.Values.provider.name }} + inferencePool: + apiVersion: {{ .Environment.Values.inferencePool.apiVersion }} + inferenceExtension: + monitoring: + gke: + enabled: true + prometheus: + enabled: false + {{- else if or (eq .Environment.Name "istio") (eq .Environment.Name "default") }} + - provider: + name: {{ .Environment.Values.provider.name }} + - istio: + {{ .Environment.Values.istio | toYaml | nindent 10 }} + - istio: + destinationRule: + host: {{ printf 
"gaie-%s-epp.%s.svc.cluster.local" $rn $ns | quote }} + {{- end }} + labels: + kind: inference-stack + + - name: {{ printf "ms-%s" $rn | quote }} + namespace: {{ $ns }} + chart: llm-d-modelservice/llm-d-modelservice + version: v0.2.11 + installed: true + needs: + - {{ printf "infra-%s" $rn | quote }} + - {{ printf "gaie-%s" $rn | quote }} + values: + - ms-sim/values.yaml + set: + # apply release name derived values + - name: "routing.inferencePool.name" + value: {{ printf "gaie-%s" $rn | quote }} + - name: "routing.parentRefs[0].name" + value: {{ printf "infra-%s-inference-gateway" $rn | quote }} + {{- if (eq .Environment.Name "gke") }} + - name: "decode.monitoring.podmonitor.enabled" + value: false + {{- end }} + labels: + kind: inference-stack diff --git a/k8s/guides/simulated-accelerators/httproute.gke.yaml b/k8s/guides/simulated-accelerators/httproute.gke.yaml new file mode 100644 index 000000000..bd587c8a0 --- /dev/null +++ b/k8s/guides/simulated-accelerators/httproute.gke.yaml @@ -0,0 +1,19 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-d-sim +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-sim-inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: gaie-sim + weight: 1 + matches: + - path: + type: PathPrefix + value: /