From fa77f0673a81e271f5da6ef36e45b39ac1c75f2b Mon Sep 17 00:00:00 2001 From: Sean Sullivan Date: Mon, 25 Aug 2025 17:17:44 +0000 Subject: [PATCH] hpa recipe for ai inference using gpu custom metrics --- AI/vllm-deployment/hpa/.gitignore | 1 + AI/vllm-deployment/hpa/README.md | 97 +++++++++ .../hpa/gpu-dcgm-exporter-service.yaml | 37 ++++ .../hpa/gpu-horizontal-pod-autoscaler.yaml | 65 ++++++ AI/vllm-deployment/hpa/gpu-hpa.md | 206 ++++++++++++++++++ .../hpa/gpu-service-monitor.yaml | 34 +++ .../hpa/horizontal-pod-autoscaler.yaml | 48 ++++ .../hpa/prometheus-adapter.yaml | 200 +++++++++++++++++ AI/vllm-deployment/hpa/prometheus-rule.yaml | 41 ++++ AI/vllm-deployment/hpa/request-looper.sh | 52 +++++ AI/vllm-deployment/hpa/vllm-hpa.md | 180 +++++++++++++++ .../hpa/vllm-service-monitor.yaml | 33 +++ 12 files changed, 994 insertions(+) create mode 100644 AI/vllm-deployment/hpa/.gitignore create mode 100644 AI/vllm-deployment/hpa/README.md create mode 100644 AI/vllm-deployment/hpa/gpu-dcgm-exporter-service.yaml create mode 100644 AI/vllm-deployment/hpa/gpu-horizontal-pod-autoscaler.yaml create mode 100644 AI/vllm-deployment/hpa/gpu-hpa.md create mode 100644 AI/vllm-deployment/hpa/gpu-service-monitor.yaml create mode 100644 AI/vllm-deployment/hpa/horizontal-pod-autoscaler.yaml create mode 100644 AI/vllm-deployment/hpa/prometheus-adapter.yaml create mode 100644 AI/vllm-deployment/hpa/prometheus-rule.yaml create mode 100755 AI/vllm-deployment/hpa/request-looper.sh create mode 100644 AI/vllm-deployment/hpa/vllm-hpa.md create mode 100644 AI/vllm-deployment/hpa/vllm-service-monitor.yaml diff --git a/AI/vllm-deployment/hpa/.gitignore b/AI/vllm-deployment/hpa/.gitignore new file mode 100644 index 000000000..ccf4ddc2d --- /dev/null +++ b/AI/vllm-deployment/hpa/.gitignore @@ -0,0 +1 @@ +GEMINI.md diff --git a/AI/vllm-deployment/hpa/README.md b/AI/vllm-deployment/hpa/README.md new file mode 100644 index 000000000..19fab8966 --- /dev/null +++ b/AI/vllm-deployment/hpa/README.md @@ -0,0 +1,97 @@ +# Horizontal Pod Autoscaling AI Inference Server + +This exercise shows how to set up the infrastructure to automatically +scale an AI inference server, using custom metrics (either server +or GPU metrics). This exercise requires a running Prometheus instance, +preferably managed by the Prometheus Operator. We assume +you already have the vLLM AI inference server running from this +[exercise](../README.md), in the parent directory. + +## Architecture + +The autoscaling solution works as follows: + +1. The **vLLM Server** or the **NVIDIA DCGM Exporter** exposes raw metrics on a `/metrics` endpoint. +2. A **ServiceMonitor** resource declaratively specifies how Prometheus should discover and scrape these metrics. +3. The **Prometheus Operator** detects the `ServiceMonitor` and configures its managed **Prometheus Server** instance to begin scraping the metrics. +4. For GPU metrics, a **PrometheusRule** is used to relabel the raw DCGM metrics, creating a new, HPA-compatible metric. +5. The **Prometheus Adapter** queries the Prometheus Server for the processed metrics and exposes them through the Kubernetes custom metrics API. +6. The **Horizontal Pod Autoscaler (HPA)** controller queries the custom metrics API for the metrics and compares them to the target values defined in the `HorizontalPodAutoscaler` resource. +7. If the metrics exceed the target, the HPA scales up the `vllm-gemma-deployment`. 
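
As a quick end-to-end check of steps 5–7, you can query the custom metrics API directly once the Prometheus Adapter is running. The command below is a minimal sketch that assumes the request-based metric name (`vllm_num_requests_running`) and the `vllm-example` namespace used later in this recipe:

```bash
# Spot-check the per-pod metric values the HPA controller will consume.
kubectl get --raw \
  "/apis/custom.metrics.k8s.io/v1beta1/namespaces/vllm-example/pods/*/vllm_num_requests_running" | jq .
```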
+ + +```mermaid +flowchart TD + D("PrometheusRule (GPU Metric Only)") + B("Prometheus Server") + C("ServiceMonitor") + subgraph subGraph0["Metrics Collection"] + A["vLLM Server"] + H["GPU DCGM Exporter"] + end + subgraph subGraph1["HPA Scaling Logic"] + E("Prometheus Adapter") + F("API Server (Custom Metrics)") + G("HPA Controller") + end + B -- Scrapes Raw Metrics --> A + B -- Scrapes Raw Metrics --> H + C -- Configures Scrape <--> B + B -- Processes Raw Metrics via --> D + D -- Creates Clean Metric in --> B + F -- Custom Metrics API <--> E + E -- Queries Processed Metric <--> B + G -- Queries Custom Metric --> F +``` + + +## Prerequisites + +This guide assumes you have a running Kubernetes cluster and `kubectl` installed. The vLLM server will be deployed in the `vllm-example` namespace, and the Prometheus resources will be in the `monitoring` namespace. The HPA resources will be deployed to the `vllm-example` namespace by specifying the namespace on the command line. + +> **Note on Cluster Permissions:** This exercise requires permissions to install components that run on the cluster nodes themselves. The Prometheus Operator and the NVIDIA DCGM Exporter both deploy DaemonSets that require privileged access to the nodes to collect metrics. For GKE users, this means a **GKE Standard** cluster is required, as GKE Autopilot's security model restricts this level of node access. + +### Prometheus Operator Installation + +The following commands will install the Prometheus Operator. It is recommended to install it in its own `monitoring` namespace. + +```bash +# Add the Prometheus community Helm repository +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts/ +helm repo update + +# Install the Prometheus Operator into the "monitoring" namespace +helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace +``` +**Note:** The default configuration of the Prometheus Operator only watches for `ServiceMonitor` resources within its own namespace. The `vllm-service-monitor.yaml` is configured to be in the `monitoring` namespace and watch for services in the `vllm-example` namespace, so no extra configuration is needed. + +## I. HPA for vLLM AI Inference Server using vLLM metrics + +[vLLM AI Inference Server HPA](./vllm-hpa.md) + +## II. HPA for vLLM AI Inference Server using NVidia GPU metrics + +[vLLM AI Inference Server HPA with GPU metrics](./gpu-hpa.md) + +### Choosing the Right Metric: Trade-offs and Combining Metrics + +This project provides two methods for autoscaling: one based on the number of running requests (`vllm:num_requests_running`) and the other on GPU utilization (`dcgm_fi_dev_gpu_util`). Each has its own advantages, and they can be combined for a more robust scaling strategy. + +#### **Trade-offs** + +* **Number of Running Requests (Application-Level Metric):** + * **Pros:** This is a direct measure of the application's current workload. It is highly responsive to sudden changes in traffic, making it ideal for latency-sensitive applications. Scaling decisions are based on the actual number of requests being processed, which can be a more accurate predictor of future load than hardware utilization alone. + * **Cons:** This metric may not always correlate directly with resource consumption. For example, a few computationally expensive requests could saturate the GPU, while a large number of simple requests might not. 
If the application has issues reporting this metric, the HPA will not be able to scale the deployment correctly. + +* **GPU Utilization (Hardware-Level Metric):** + * **Pros:** This provides a direct measurement of how busy the underlying hardware is. It is a reliable indicator of resource saturation and is useful for optimizing costs by scaling down when the GPU is underutilized. + * **Cons:** GPU utilization can be a lagging indicator. By the time utilization is high, the application's latency may have already increased. It also does not distinguish between a single, intensive request and multiple, less demanding ones. + +#### **Combining Metrics for Robustness** + +For the most robust autoscaling, you can configure the HPA to use multiple metrics. For example, you could scale up if *either* the number of running requests exceeds a certain threshold *or* if GPU utilization spikes. The HPA will scale the deployment up if any of the metrics cross their defined thresholds, but it will only scale down when *all* metrics are below their target values (respecting the scale-down stabilization window). + +This combined approach provides several benefits: +- **Proactive Scaling:** The HPA can scale up quickly in response to an increase in running requests, preventing latency spikes. +- **Resource Protection:** It can also scale up if a small number of requests are consuming a large amount of GPU resources, preventing the server from becoming overloaded. +- **Cost-Effective Scale-Down:** The deployment will only scale down when both the request load and GPU utilization are low, ensuring that resources are not removed prematurely. diff --git a/AI/vllm-deployment/hpa/gpu-dcgm-exporter-service.yaml b/AI/vllm-deployment/hpa/gpu-dcgm-exporter-service.yaml new file mode 100644 index 000000000..80e027233 --- /dev/null +++ b/AI/vllm-deployment/hpa/gpu-dcgm-exporter-service.yaml @@ -0,0 +1,37 @@ +# This Service provides a stable network endpoint for the NVIDIA DCGM Exporter +# pods. The Prometheus Operator's ServiceMonitor will target this Service +# to discover and scrape the GPU metrics. This is especially important +# because the exporter pods are part of a DaemonSet, and their IPs can change. +# +# NOTE: This configuration is specific to GKE, which automatically deploys the +# DCGM exporter in the 'gke-managed-system' namespace. For other cloud +# providers or on-premise clusters, you would need to deploy your own DCGM +# exporter (e.g., via a Helm chart) and update this Service's 'namespace' +# and 'labels' to match your deployment. + +apiVersion: v1 +kind: Service +metadata: + name: gke-managed-dcgm-exporter + # GKE-SPECIFIC: GKE deploys its managed DCGM exporter in this namespace. + # On other platforms, this would be the namespace where you deploy the exporter. + namespace: gke-managed-system + labels: + # This label is critical. The ServiceMonitor uses this label to find this + # specific Service. If the labels don't match, Prometheus will not be + # able to discover the metrics endpoint. + # GKE-SPECIFIC: This label is used by GKE's managed service. For a custom + # deployment, you would use a more generic label like 'nvidia-dcgm-exporter'. + app.kubernetes.io/name: gke-managed-dcgm-exporter +spec: + selector: + # This selector tells the Service which pods to route traffic to. + # It must match the labels on the DCGM exporter pods. + # GKE-SPECIFIC: This selector matches the labels on GKE's managed DCGM pods. 
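    # A quick way to confirm this selector matches real pods (illustrative check;
    # adjust the namespace and label if you deployed your own exporter):
    #   kubectl get pods -n gke-managed-system \
    #     -l app.kubernetes.io/name=gke-managed-dcgm-exporter --show-labels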
+ app.kubernetes.io/name: gke-managed-dcgm-exporter + ports: + - # The 'name' of this port is important. The ServiceMonitor will specifically + # look for a port with this name to scrape metrics from. + name: metrics + port: 9400 + targetPort: 9400 \ No newline at end of file diff --git a/AI/vllm-deployment/hpa/gpu-horizontal-pod-autoscaler.yaml b/AI/vllm-deployment/hpa/gpu-horizontal-pod-autoscaler.yaml new file mode 100644 index 000000000..23cfae790 --- /dev/null +++ b/AI/vllm-deployment/hpa/gpu-horizontal-pod-autoscaler.yaml @@ -0,0 +1,65 @@ +# This HorizontalPodAutoscaler (HPA) targets the vLLM deployment and scales +# it based on the average GPU utilization across all pods. It uses the +# custom metric 'gpu_utilization_percent', which is provided by the +# Prometheus Adapter. + +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: gemma-server-gpu-hpa +spec: + # scaleTargetRef points the HPA to the deployment it needs to scale. + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: vllm-gemma-deployment + minReplicas: 1 + maxReplicas: 5 + metrics: + - type: Pods + pods: + metric: + # This is the custom metric that the HPA will query. + # IMPORTANT: This name ('gpu_utilization_percent') is not the raw metric + # from the DCGM exporter. It is the clean, renamed metric that is + # exposed by the Prometheus Adapter. The names must match exactly. + name: gpu_utilization_percent + target: + type: AverageValue + # This is the target value for the metric. The HPA will add or remove + # pods to keep the average GPU utilization across all pods at 20%. + averageValue: 20 + behavior: + scaleUp: + # The stabilizationWindowSeconds is set to 0 to allow for immediate + # scaling up. This is a trade-off: + # - For highly volatile workloads, immediate scaling is critical to + # maintain performance and responsiveness. + # - However, this also introduces a risk of over-scaling if the workload + # spikes are very brief. A non-zero value would make the scaling + # less sensitive to short-lived spikes, but could introduce latency + # if the load persists. + stabilizationWindowSeconds: 0 + policies: + - type: Pods + value: 4 + periodSeconds: 15 + - type: Percent + value: 100 + periodSeconds: 15 + selectPolicy: Max + scaleDown: + # The stabilizationWindowSeconds is set to 30 to prevent the HPA from + # scaling down too aggressively. This means the controller will wait for + # 30 seconds after a scale-down event before considering another one. + # This helps to smooth out the scaling behavior and prevent "flapping" + # (rapidly scaling up and down). A larger value will make the scaling + # more conservative, which can be useful for workloads with fluctuating + # metrics, but it may also result in higher costs if the resources are + # not released quickly after a load decrease. + stabilizationWindowSeconds: 30 + policies: + - type: Percent + value: 100 + periodSeconds: 15 + selectPolicy: Max \ No newline at end of file diff --git a/AI/vllm-deployment/hpa/gpu-hpa.md b/AI/vllm-deployment/hpa/gpu-hpa.md new file mode 100644 index 000000000..a41e2e6c1 --- /dev/null +++ b/AI/vllm-deployment/hpa/gpu-hpa.md @@ -0,0 +1,206 @@ +# Autoscaling an AI Inference Server with HPA using NVIDIA GPU Metrics + +This guide provides a detailed walkthrough for configuring a Kubernetes Horizontal Pod Autoscaler (HPA) to dynamically scale a vLLM AI inference server based on NVIDIA GPU utilization. 
The autoscaling logic is driven by the `DCGM_FI_DEV_GPU_UTIL` metric, which is exposed by the NVIDIA Data Center GPU Manager (DCGM) Exporter. This approach allows the system to scale based on the actual hardware utilization of the GPU, providing a reliable indicator of workload intensity.

This guide assumes you have already deployed the vLLM inference server from the [parent directory's exercise](../README.md) into the `vllm-example` namespace.

---

## 1. Verify GPU Metric Collection

The first step is to ensure that GPU metrics are being collected and exposed within the cluster. This is handled by the NVIDIA DCGM Exporter, which runs as a DaemonSet on GPU-enabled nodes and scrapes metrics directly from the GPU hardware. The method for deploying this exporter varies across cloud providers.

### 1.1. Cloud Provider DCGM Exporter Setup

Below are the common setups for GKE, AKS, and EKS.

#### Google Kubernetes Engine (GKE)

On GKE, the DCGM exporter is a managed add-on that is automatically deployed and managed by the system. It runs in the `gke-managed-system` namespace.

**Verification:**
You can verify that the exporter pods are running with the following command:
```bash
kubectl get pods --namespace gke-managed-system | grep dcgm-exporter
```
You should see one or more `dcgm-exporter` pods in a `Running` state.

#### Amazon Elastic Kubernetes Service (EKS) & Microsoft Azure Kubernetes Service (AKS)

On both EKS and AKS, the DCGM exporter is not a managed service and must be installed manually. The standard method is to use the official NVIDIA DCGM Exporter Helm chart, which deploys the exporter as a DaemonSet.

**Installation (for both EKS and AKS):**
If you don't already have the exporter installed, you can do so with the following Helm commands:
```bash
helm repo add gpu-helm-charts https://nvidia.github.io/dcgm-exporter/helm-charts
helm repo update
helm install dcgm-exporter gpu-helm-charts/dcgm-exporter --namespace monitoring --create-namespace
```
*Note: We are installing it into the `monitoring` namespace (created if it does not yet exist) to keep all monitoring-related components together.*

**Verification:**
You can verify that the exporter pods are running in the `monitoring` namespace:
```bash
kubectl get pods --namespace monitoring | grep dcgm-exporter
```
You should see one or more `dcgm-exporter` pods in a `Running` state.

---

## 2. Set Up Prometheus for Metric Collection

With the metric source confirmed, the next step is to configure Prometheus to scrape, process, and store these metrics.

### 2.1. Install the Prometheus Operator

The Prometheus Operator can be easily installed using its official Helm chart. This will deploy a full monitoring stack into the `monitoring` namespace. If you have already installed it in the previous exercise, you can skip this step.

```bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts/
helm repo update
helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace
```

### 2.2. Create a Service for the DCGM Exporter

The `ServiceMonitor` needs a stable network endpoint to reliably scrape metrics from the DCGM exporter pods. A Kubernetes Service provides this stable endpoint.

Apply the service manifest:
```bash
kubectl apply -f ./gpu-dcgm-exporter-service.yaml
```

Verify that the service has been created successfully:
```bash
kubectl get svc -n gke-managed-system | grep gke-managed-dcgm-exporter
```

### 2.3. 
Configure Metric Scraping with a `ServiceMonitor` + +The `ServiceMonitor` tells the Prometheus Operator to scrape the DCGM exporter Service. + +```bash +kubectl apply -f ./gpu-service-monitor.yaml +``` + +### 2.4. Create a Prometheus Rule for Metric Relabeling + +This is a critical step. The raw `DCGM_FI_DEV_GPU_UTIL` metric does not have the standard `pod` and `namespace` labels the HPA needs. This `PrometheusRule` creates a *new*, correctly-labelled metric named `gke_dcgm_fi_dev_gpu_util_relabelled` that the Prometheus Adapter can use. + +```bash +kubectl apply -f ./prometheus-rule.yaml +``` + +### 2.5. Verify Metric Collection and Relabeling in Prometheus + +To ensure the entire pipeline is working, you must verify that the *new*, relabelled metric exists. First, establish a port-forward to the Prometheus service. + +```bash +kubectl port-forward svc/prometheus-kube-prometheus-prometheus 9090:9090 -n monitoring +``` + +In a separate terminal, use `curl` to query for the new metric. +```bash +# Query Prometheus for the new, relabelled metric +curl -sS "http://localhost:9090/api/v1/query?query=gke_dcgm_fi_dev_gpu_util_relabelled" | jq +``` +A successful verification will show the metric in the `result` array, complete with the correct `pod` and `namespace` labels. + +--- + +## 3. Configure the Horizontal Pod Autoscaler + +Now that a clean, usable metric is available in Prometheus, you can configure the HPA. + +### 3.1. Deploy the Prometheus Adapter + +The Prometheus Adapter bridges Prometheus and the Kubernetes custom metrics API. It is configured to read the `gke_dcgm_fi_dev_gpu_util_relabelled` metric and expose it as `gpu_utilization_percent`. + +```bash +kubectl apply -f ./prometheus-adapter.yaml +``` +Verify that the adapter's pod is running in the `monitoring` namespace. + +### 3.2. Verify the Custom Metrics API + +After deploying the adapter, it's vital to verify that it is successfully exposing the transformed metrics to the Kubernetes API. You can do this by querying the custom metrics API directly. + +```bash +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq . +``` + +The output should be a list of available custom metrics. Look for the `pods/gpu_utilization_percent` metric, which confirms that the entire pipeline is working correctly and the metric is ready for the HPA to consume. + +```json +{ + "kind": "APIResourceList", + "apiVersion": "v1", + "groupVersion": "custom.metrics.k8s.io/v1beta1", + "resources": [ + { + "name": "pods/gpu_utilization_percent", + "singularName": "", + "namespaced": true, + "kind": "MetricValueList", + "verbs": [ + "get" + ] + } + ] +} +``` + +### 3.3. Deploy the Horizontal Pod Autoscaler (HPA) + +The HPA is configured to use the final, clean metric name, `gpu_utilization_percent`, to maintain an average GPU utilization of 20%. + +```bash +kubectl apply -f ./gpu-horizontal-pod-autoscaler.yaml -n vllm-example +``` + +Inspect the HPA's configuration to confirm it's targeting the correct metric. +```bash +kubectl describe hpa/gemma-server-gpu-hpa -n vllm-example +# Expected output should include: +# Metrics: ( current / target ) +# "gpu_utilization_percent" on pods: / 20 +``` + +--- + +## 4. Load Test the Autoscaling Setup + +Generate a sustained load on the vLLM server to cause GPU utilization to rise. + +### 4.1. Generate Inference Load + +First, establish a port-forward to the vLLM service. +```bash +kubectl port-forward service/vllm-service -n vllm-example 8081:8081 +``` + +In another terminal, execute the `request-looper.sh` script. 
+```bash +./request-looper.sh +``` + +### 4.2. Observe the HPA Scaling the Deployment + +While the load script is running, monitor the HPA's behavior. +```bash +# See the HPA's metric values and scaling events +kubectl describe hpa/gemma-server-gpu-hpa -n vllm-example + +# Watch the number of deployment replicas increase +kubectl get deploy/vllm-gemma-deployment -n vllm-example -w +``` +As the average GPU utilization exceeds the 20% target, the HPA will scale up the deployment. + +--- + +## 5. Cleanup + +To tear down the resources from this exercise, run the following command: +```bash +kubectl delete -f . -n vllm-example +``` \ No newline at end of file diff --git a/AI/vllm-deployment/hpa/gpu-service-monitor.yaml b/AI/vllm-deployment/hpa/gpu-service-monitor.yaml new file mode 100644 index 000000000..d0818ff9e --- /dev/null +++ b/AI/vllm-deployment/hpa/gpu-service-monitor.yaml @@ -0,0 +1,34 @@ +# This ServiceMonitor tells the Prometheus Operator how to discover and scrape +# metrics from the NVIDIA DCGM Exporter. It is designed to find the +# 'gke-managed-dcgm-exporter' Service in the 'gke-managed-system' namespace +# and scrape its '/metrics' endpoint. +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: nvidia-dcgm-exporter-servicemonitor + namespace: monitoring + labels: + # This label is used by the Prometheus Operator to discover this + # ServiceMonitor. It must match the 'serviceMonitorSelector' configured + # in the Prometheus custom resource. + release: prometheus +spec: + # This selector identifies the specific Service to scrape. It must match + # the labels on the 'gke-managed-dcgm-exporter' Service. + selector: + matchLabels: + # GKE-SPECIFIC: This label matches the Service for GKE's managed DCGM + # exporter. If you are using a different DCGM deployment, you must + # update this label to match the label of the corresponding Service. + app.kubernetes.io/name: gke-managed-dcgm-exporter + # This selector specifies which namespace to search for the target Service. + # For GKE, the DCGM service is in 'gke-managed-system'. + namespaceSelector: + matchNames: + # GKE-SPECIFIC: This is the namespace for GKE's managed DCGM exporter. + # For other environments, this should be the namespace where you have + # deployed the DCGM exporter Service. + - gke-managed-system + endpoints: + - port: metrics + interval: 15s \ No newline at end of file diff --git a/AI/vllm-deployment/hpa/horizontal-pod-autoscaler.yaml b/AI/vllm-deployment/hpa/horizontal-pod-autoscaler.yaml new file mode 100644 index 000000000..a57968dd9 --- /dev/null +++ b/AI/vllm-deployment/hpa/horizontal-pod-autoscaler.yaml @@ -0,0 +1,48 @@ +# This HorizontalPodAutoscaler (HPA) targets the vLLM deployment and scales +# it based on the average number of concurrent requests across all pods. +# It uses the custom metric 'vllm_num_requests_running', which is provided +# by the Prometheus Adapter. + +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: gemma-server-hpa +spec: + # scaleTargetRef points the HPA to the deployment it needs to scale. + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: vllm-gemma-deployment + minReplicas: 1 + maxReplicas: 5 + metrics: + - type: Pods + pods: + metric: + # This is the custom metric that the HPA will query. + # IMPORTANT: This name ('vllm_num_requests_running') is not the raw metric + # from the vLLM server. It is the clean, renamed metric that is + # exposed by the Prometheus Adapter. The names must match exactly. 
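        # If the HPA later reports this metric as <unknown>, you can list the
        # metric names the adapter is actually serving (illustrative check):
        #   kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq '.resources[].name'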
+ name: vllm_num_requests_running + target: + type: AverageValue + # This is the target value for the metric. The HPA will add or remove + # pods to keep the average number of running requests per pod at 4. + averageValue: 4 + behavior: + # The scaling behavior can be customized to control how quickly the + # deployment scales up or down. + scaleDown: + # The stabilizationWindowSeconds is set to 30 to prevent the HPA from + # scaling down too aggressively. This means the controller will wait for + # 30 seconds after a scale-down event before considering another one. + # This helps to smooth out the scaling behavior and prevent "flapping" + # (rapidly scaling up and down). A larger value will make the scaling + # more conservative, which can be useful for workloads with fluctuating + # metrics, but it may also result in higher costs if the resources are + # not released quickly after a load decrease. + stabilizationWindowSeconds: 30 + policies: + - type: Percent + value: 100 + periodSeconds: 15 \ No newline at end of file diff --git a/AI/vllm-deployment/hpa/prometheus-adapter.yaml b/AI/vllm-deployment/hpa/prometheus-adapter.yaml new file mode 100644 index 000000000..d1710c279 --- /dev/null +++ b/AI/vllm-deployment/hpa/prometheus-adapter.yaml @@ -0,0 +1,200 @@ +# This manifest deploys the Prometheus Adapter, which is responsible for +# reading metrics from Prometheus and exposing them to the Kubernetes +# Custom Metrics API. The Horizontal Pod Autoscaler (HPA) uses this API +# to query for the custom metrics that drive its scaling decisions. +# This file also includes the necessary RBAC permissions and the critical +# ConfigMap that defines the metric transformation rules. + + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-adapter + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus-adapter +rules: +- apiGroups: + - "" + resources: + - nodes + - namespaces + - pods + - services + verbs: + - get + - list + - watch +- apiGroups: + - extensions + resources: + - deployments + - replicasets + - ingresses + verbs: + - get + - list + - watch +- apiGroups: + - apps + resources: + - deployments + - replicasets + - statefulsets + verbs: + - get + - list + - watch +- apiGroups: + - custom.metrics.k8s.io + resources: + - '*' + verbs: + - '*' +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus-adapter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-adapter +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prometheus-adapter-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus-adapter-system-auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-adapter + namespace: monitoring +data: + config.yaml: | + # This 'rules' section is the core of the Prometheus Adapter's configuration. 
+ # It defines how metrics from Prometheus are transformed and exposed to the + # Kubernetes Custom Metrics API. + rules: + # This rule renames the 'vllm:num_requests_running' metric to + # 'vllm_num_requests_running' to make it a valid custom metric name. + - seriesQuery: 'vllm:num_requests_running' + resources: + overrides: + namespace: {resource: "namespace"} + pod: {resource: "pod"} + name: + matches: "vllm:num_requests_running" + as: "vllm_num_requests_running" + metricsQuery: 'sum(vllm:num_requests_running{<<.LabelMatchers>>}) by (<<.GroupBy>>)' + # This rule targets the NEW metric created by the PrometheusRule. + # It takes 'gke_dcgm_fi_dev_gpu_util_relabelled' (which now has the correct + # pod and namespace labels) and renames it to the final, clean name + # 'gpu_utilization_percent' for the HPA to use. + - seriesQuery: 'gke_dcgm_fi_dev_gpu_util_relabelled' + resources: + overrides: + namespace: {resource: "namespace"} + pod: {resource: "pod"} + name: + matches: "gke_dcgm_fi_dev_gpu_util_relabelled" + as: "gpu_utilization_percent" + metricsQuery: 'sum(gke_dcgm_fi_dev_gpu_util_relabelled{<<.LabelMatchers>>}) by (<<.GroupBy>>)' +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-adapter + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus-adapter + template: + metadata: + labels: + app: prometheus-adapter + spec: + serviceAccountName: prometheus-adapter + containers: + - name: prometheus-adapter + image: registry.k8s.io/prometheus-adapter/prometheus-adapter:v0.11.2 + args: + - --prometheus-url=http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090/ + - --metrics-relist-interval=1m + - --config=/etc/adapter/config.yaml + - --secure-port=6443 + - --cert-dir=/tmp/cert + - --v=4 # Increase log verbosity for debugging + ports: + - containerPort: 6443 + name: https + volumeMounts: + - name: config + mountPath: /etc/adapter + readOnly: true + - name: certs + mountPath: /tmp/cert + volumes: + - name: config + configMap: + name: prometheus-adapter + - name: certs + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus-adapter + namespace: monitoring +spec: + ports: + - port: 443 + targetPort: https + selector: + app: prometheus-adapter +--- +apiVersion: apiregistration.k8s.io/v1 +kind: APIService +metadata: + name: v1beta1.custom.metrics.k8s.io +spec: + service: + name: prometheus-adapter + namespace: monitoring + group: custom.metrics.k8s.io + version: v1beta1 + insecureSkipTLSVerify: true + groupPriorityMinimum: 100 + versionPriority: 100 \ No newline at end of file diff --git a/AI/vllm-deployment/hpa/prometheus-rule.yaml b/AI/vllm-deployment/hpa/prometheus-rule.yaml new file mode 100644 index 000000000..7ad57dbcb --- /dev/null +++ b/AI/vllm-deployment/hpa/prometheus-rule.yaml @@ -0,0 +1,41 @@ +# This PrometheusRule defines a recording rule that is essential for making +# the raw DCGM GPU metrics usable by the HPA. The raw 'DCGM_FI_DEV_GPU_UTIL' +# metric scraped by Prometheus does not have the standard 'pod' and 'namespace' +# labels that the Prometheus Adapter needs to associate the metric with a +# specific workload pod. +# +# This rule creates a NEW metric, 'gke_dcgm_fi_dev_gpu_util_relabelled', +# and uses the 'label_replace' function to copy the pod and namespace +# information from the 'exported_pod' and 'exported_namespace' labels into +# the standard 'pod' and 'namespace' labels. The Prometheus Adapter will then +# use this new, correctly-labelled metric. 
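#
# As an illustration (pod name hypothetical, other labels omitted), a raw sample such as
#   DCGM_FI_DEV_GPU_UTIL{exported_namespace="vllm-example", exported_pod="vllm-gemma-deployment-abc12"} 37
# is recorded by this rule as
#   gke_dcgm_fi_dev_gpu_util_relabelled{namespace="vllm-example", pod="vllm-gemma-deployment-abc12"} 37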
+ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: dcgm-relabel-rules + namespace: monitoring + labels: + # This label ensures the Prometheus instance discovers this rule. + release: prometheus +spec: + groups: + - name: dcgm.rules + rules: + # 'record' specifies the name of the new metric to be created. + - record: gke_dcgm_fi_dev_gpu_util_relabelled + # 'expr' contains the PromQL expression that generates the new metric. + expr: | + label_replace( + label_replace( + DCGM_FI_DEV_GPU_UTIL, + "pod", + "$1", + "exported_pod", + "(.+)" + ), + "namespace", + "$1", + "exported_namespace", + "(.+)" + ) \ No newline at end of file diff --git a/AI/vllm-deployment/hpa/request-looper.sh b/AI/vllm-deployment/hpa/request-looper.sh new file mode 100755 index 000000000..249e06b3e --- /dev/null +++ b/AI/vllm-deployment/hpa/request-looper.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# This script sends a POST request to a local LLM endpoint every second +# without waiting for a response. It runs until manually stopped by pressing Ctrl+C. +# +# Usage: ./request-looper.sh [PORT] [MODEL_NAME] ["MESSAGE CONTENT"] +# +# Examples: +# ./request-looper.sh +# ./request-looper.sh 8082 +# ./request-looper.sh 8082 "google/gemma-2b" "What is the capital of France?" + +# --- Configuration (with defaults) --- +# Use command-line arguments if provided, otherwise use defaults. +PORT=${1:-"8081"} +MODEL=${2:-"google/gemma-3-1b-it"} +CONTENT=${3:-"Explain Quantum Computing in simple terms."} + +# The URL of the LLM API endpoint. +URL="http://localhost:${PORT}/v1/chat/completions" + +# The JSON payload for the request. +JSON_PAYLOAD=$(printf '{ + "model": "%s", + "messages": [{"role": "user", "content": "%s"}] +}' "$MODEL" "$CONTENT") + +# --- Script Logic --- +echo "Starting request loop..." +echo " PORT: $PORT" +echo " MODEL: $MODEL" +echo " CONTENT: $CONTENT" +echo "Press Ctrl+C to stop." + +# Infinite loop to send requests. +while true +do + echo "----------------------------------------" + echo "Sending request at $(date)" + + # Send the POST request using curl and run it in the background (&). + # The output and errors are redirected to /dev/null to keep the console clean. + curl -X POST "$URL" \ + -H "Content-Type: application/json" \ + -d "$JSON_PAYLOAD" \ + --silent \ + -o /dev/null \ + -w "HTTP Status: %{http_code}\n" & + + # Wait for 1 second before the next request. + sleep 1 +done diff --git a/AI/vllm-deployment/hpa/vllm-hpa.md b/AI/vllm-deployment/hpa/vllm-hpa.md new file mode 100644 index 000000000..5993f5e56 --- /dev/null +++ b/AI/vllm-deployment/hpa/vllm-hpa.md @@ -0,0 +1,180 @@ +# Autoscaling an AI Inference Server with HPA using vLLM Server Metrics + +This guide provides a comprehensive walkthrough for configuring a Kubernetes Horizontal Pod Autoscaler (HPA) to dynamically scale a vLLM AI inference server. The autoscaling logic is driven by a custom metric, `vllm:num_requests_running`, which is exposed directly by the vLLM server. This approach allows the system to scale based on the actual workload (i.e., the number of concurrent inference requests) rather than generic CPU or memory metrics. + +This guide assumes you have already deployed the vLLM inference server from the [parent directory's exercise](../README.md) into the `vllm-example` namespace. + +--- + +## 1. Verify vLLM Server Metrics + +Before configuring autoscaling, it's crucial to confirm that the metric source is functioning correctly. 
This involves ensuring that the vLLM server's `/metrics` endpoint is reachable and is exposing the `vllm:num_requests_running` metric. + +### 1.1. Access the Metrics Endpoint + +To access the vLLM service from your local machine, use `kubectl port-forward`. This command creates a secure tunnel to the service within the cluster. + +```bash +kubectl port-forward service/vllm-service -n vllm-example 8081:8081 +``` + +With the port forward active, open a new terminal and use `curl` to query the `/metrics` endpoint. Filter the output for the target metric to confirm its presence. + +```bash +curl -sS http://localhost:8081/metrics | grep num_requests_ +``` + +The expected output should include the metric name and its current value, which will likely be `0.0` on an idle server: +``` +# HELP vllm:num_requests_running Number of requests currently running on GPU. +# TYPE vllm:num_requests_running gauge +vllm:num_requests_running{model_name="google/gemma-3-1b-it"} 0.0 +``` +Once you have verified that the metric is being exposed, you can stop the `port-forward` process. + +--- + +## 2. Set Up Prometheus for Metric Collection + +With the metric source confirmed, the next step is to collect these metrics using Prometheus. This is achieved by installing the Prometheus Operator, which simplifies the management and discovery of monitoring targets in Kubernetes. + +### 2.1. Install the Prometheus Operator + +The Prometheus Operator can be easily installed using its official Helm chart. This will deploy a full monitoring stack into the `monitoring` namespace. If you have already installed it in the GPU metrics exercise, you can skip this step. + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts/ +helm repo update +helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace +``` + +You can verify the installation by listing the pods in the `monitoring` namespace. +```bash +kubectl get pods --namespace monitoring +``` + +### 2.2. Configure Metric Scraping with a `ServiceMonitor` + +The `ServiceMonitor` is a custom resource provided by the Prometheus Operator that declaratively defines how a set of services should be monitored. The manifest below creates a `ServiceMonitor` that targets the vLLM service and instructs Prometheus to scrape its `/metrics` endpoint. + +```bash +kubectl apply -f ./vllm-service-monitor.yaml +``` + +### 2.3. Verify Metric Collection in Prometheus + +To ensure that Prometheus has discovered the target and is successfully scraping the metrics, you can query the Prometheus API. First, establish a port-forward to the Prometheus service. + +```bash +kubectl port-forward svc/prometheus-kube-prometheus-prometheus 9090:9090 -n monitoring +``` + +In a separate terminal, use `curl` and `jq` to inspect the active targets. The following commands will confirm that the `vllm-gemma-servicemonitor` is registered and healthy. + +```bash +# Verify that the scrape pool for the ServiceMonitor exists +curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[].scrapePool' | grep "vllm-gemma-servicemonitor" + +# Verify that the target's health is "up" +curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | select(.scrapePool | contains("vllm-gemma-servicemonitor"))' | jq '.health' +``` +A successful verification will return `"up"`, indicating that Prometheus is correctly collecting the vLLM server metrics. + +--- + +## 3. 
Configure the Horizontal Pod Autoscaler + +Now that the metrics are being collected, you can configure the HPA to use them. This requires two components: the Prometheus Adapter, which makes the Prometheus metrics available to Kubernetes, and the HPA resource itself. + +### 3.1. Deploy the Prometheus Adapter + +The Prometheus Adapter acts as a bridge between Prometheus and the Kubernetes custom metrics API. It queries Prometheus for the specified metrics and exposes them in a format that the HPA controller can understand. + +A critical function of the adapter in this setup is to rename the raw metric from the vLLM server. The raw metric, `vllm:num_requests_running`, contains a colon, which is not a valid character for a custom metric name in Kubernetes. The `prometheus-adapter.yaml` file contains a rule that transforms this metric: + +```yaml +# Excerpt from prometheus-adapter.yaml's ConfigMap +... +- seriesQuery: 'vllm:num_requests_running' + name: + as: "vllm_num_requests_running" +... +``` +This rule finds the raw metric and exposes it to the Kubernetes custom metrics API as `vllm_num_requests_running`, replacing the colon with an underscore. + +Deploy the adapter: +```bash +kubectl apply -f ./prometheus-adapter.yaml +``` +Verify that the adapter's pod is running in the `monitoring` namespace. + +### 3.2. Verify the Custom Metrics API + +After deploying the adapter, it's vital to verify that it is successfully exposing the transformed metrics to the Kubernetes API. You can do this by querying the custom metrics API directly. + +```bash +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq . +``` +The output should be a list of available custom metrics. Look for the `pods/vllm_num_requests_running` metric, which confirms that the metric is ready for the HPA to consume. + +### 3.3. Deploy the Horizontal Pod Autoscaler (HPA) + +The HPA resource defines the scaling behavior. The manifest below is configured to use the clean metric name, `vllm_num_requests_running`, exposed by the Prometheus Adapter. It will scale the `vllm-gemma-deployment` up or down to maintain an average of 4 concurrent requests per pod. + +```bash +kubectl apply -f ./horizontal-pod-autoscaler.yaml -n vllm-example +``` + +You can inspect the HPA's configuration and status with the `describe` command. Note that the `Metrics` section now shows our clean metric name. +```bash +kubectl describe hpa/gemma-server-hpa -n vllm-example +# Expected output should include: +# Name: gemma-server-hpa +# Namespace: vllm-example +# ... +# Metrics: ( current / target ) +# "vllm_num_requests_running" on pods: / 4 +# Min replicas: 1 +# Max replicas: 5 +``` + +--- + +## 4. Load Test the Autoscaling Setup + +To observe the HPA in action, you need to generate a sustained load on the vLLM server, causing the `vllm_num_requests_running` metric to rise above the target value. + +### 4.1. Generate Inference Load + +First, re-establish the port-forward to the vLLM service. +```bash +kubectl port-forward service/vllm-service -n vllm-example 8081:8081 +``` + +In another terminal, execute the `request-looper.sh` script. This will send a continuous stream of inference requests to the server. +```bash +./request-looper.sh +``` + +### 4.2. Observe the HPA Scaling the Deployment + +While the load script is running, you can monitor the HPA's behavior and the deployment's replica count in real-time. 
+ +```bash +# See the HPA's metric values and scaling events +kubectl describe hpa/gemma-server-hpa -n vllm-example + +# Watch the number of deployment replicas increase +kubectl get deploy/vllm-gemma-deployment -n vllm-example -w +``` +As the average number of running requests per pod exceeds the target of 4, the HPA will begin to scale up the deployment, and you will see new pods being created. + +--- + +## 5. Cleanup + +To tear down the resources created during this exercise, you can use `kubectl delete` with the `-f` flag, which will delete all resources defined in the manifests in the current directory. + +```bash +kubectl delete -f . -n vllm-example +``` \ No newline at end of file diff --git a/AI/vllm-deployment/hpa/vllm-service-monitor.yaml b/AI/vllm-deployment/hpa/vllm-service-monitor.yaml new file mode 100644 index 000000000..0ddbdcf8a --- /dev/null +++ b/AI/vllm-deployment/hpa/vllm-service-monitor.yaml @@ -0,0 +1,33 @@ +# This ServiceMonitor tells the Prometheus Operator how to discover and scrape +# metrics from the vLLM inference server. It is designed to find the +# 'vllm-service' in the 'vllm-example' namespace and scrape its '/metrics' endpoint. + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: vllm-gemma-servicemonitor + # This ServiceMonitor must be deployed in the same namespace as the + # Prometheus Operator, which is 'monitoring' in this setup. + namespace: monitoring + labels: + # This label is used by the Prometheus Operator to discover this + # ServiceMonitor. It must match the 'serviceMonitorSelector' configured + # in the Prometheus custom resource. + release: prometheus +spec: + # This selector specifies which namespace(s) to search for the target Service. + # In this case, it's looking in the 'vllm-example' namespace where the vLLM + # service is deployed. + namespaceSelector: + matchNames: + - vllm-example + # This selector identifies the specific Service to scrape within the + # selected namespace(s). It must match the labels on the 'vllm-service'. + selector: + matchLabels: + app: gemma-server + endpoints: + # This section defines the port and path for the metrics endpoint. + - port: http + path: /metrics + interval: 15s \ No newline at end of file