Added comments in HPAs for scale up/down behavior

seans3 · seans3 · commit e881087c5194 · 2025-09-08T19:53:06.000Z
diff --git a/ai/vllm-deployment/hpa/gpu-horizontal-pod-autoscaler.yaml b/ai/vllm-deployment/hpa/gpu-horizontal-pod-autoscaler.yaml
@@ -32,6 +32,14 @@ spec:
         averageValue: 20
   behavior:
     scaleUp:
+      # The stabilizationWindowSeconds is set to 0 to allow for immediate
+      # scaling up. This is a trade-off:
+      # - For highly volatile workloads, immediate scaling is critical to
+      #   maintain performance and responsiveness.
+      # - However, this also introduces a risk of over-scaling if the workload
+      #   spikes are very brief. A non-zero value would make the scaling
+      #   less sensitive to short-lived spikes, but would delay adding
+      #   replicas when the load is sustained.
       stabilizationWindowSeconds: 0
       policies:
       - type: Pods
@@ -42,6 +50,14 @@ spec:
         periodSeconds: 15
       selectPolicy: Max
     scaleDown:
+      # The stabilizationWindowSeconds is set to 30 to prevent the HPA from
+      # scaling down too aggressively. When scaling down, the controller uses
+      # the highest replica count it recommended during the last 30 seconds.
+      # This helps to smooth out the scaling behavior and prevent "flapping"
+      # (rapidly scaling up and down). A larger value will make the scaling
+      # more conservative, which can be useful for workloads with fluctuating
+      # metrics, but it may also result in higher costs if the resources are
+      # not released quickly after a load decrease.
       stabilizationWindowSeconds: 30
       policies:
       - type: Percent
diff --git a/ai/vllm-deployment/hpa/horizontal-pod-autoscaler.yaml b/ai/vllm-deployment/hpa/horizontal-pod-autoscaler.yaml
@@ -34,6 +34,14 @@ spec:
     # The scaling behavior can be customized to control how quickly the
     # deployment scales up or down.
     scaleDown:
+      # The stabilizationWindowSeconds is set to 30 to prevent the HPA from
+      # scaling down too aggressively. When scaling down, the controller uses
+      # the highest replica count it recommended during the last 30 seconds.
+      # This helps to smooth out the scaling behavior and prevent "flapping"
+      # (rapidly scaling up and down). A larger value will make the scaling
+      # more conservative, which can be useful for workloads with fluctuating
+      # metrics, but it may also result in higher costs if the resources are
+      # not released quickly after a load decrease.
       stabilizationWindowSeconds: 30
       policies:
       - type: Percent