|
| 1 | +# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s-labs/helm-charts/refs/tags/common-3.1.0/charts/other/app-template/schemas/helmrelease-helm-v2beta2.schema.json |
| 2 | +--- |
| 3 | +apiVersion: helm.toolkit.fluxcd.io/v2 |
| 4 | +kind: HelmRelease |
| 5 | +metadata: |
| 6 | + name: ${APP} |
| 7 | +spec: |
| 8 | + interval: 12h |
| 9 | + chart: |
| 10 | + spec: |
| 11 | + chart: examples/online_serving/chart-helm |
| 12 | + version: 0.0.1 |
| 13 | + sourceRef: |
| 14 | + kind: GitRepository |
| 15 | + name: vllm-charts |
| 16 | + namespace: flux-system |
| 17 | + maxHistory: 2 |
| 18 | + install: |
| 19 | + remediation: |
| 20 | + retries: 3 |
| 21 | + upgrade: |
| 22 | + cleanupOnFail: true |
| 23 | + remediation: |
| 24 | + retries: 3 |
| 25 | + uninstall: |
| 26 | + keepHistory: false |
| 27 | + values: |
| 28 | + image: |
| 29 | + # -- Image repository |
| 30 | + repository: "vllm/vllm-openai" |
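| 31 | + # -- Image tag (NOTE(review): "latest" is unpinned, while the `vllm-release` label at the end of this file says "v0.14.1" - confirm whether the tag should be pinned to that release) |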
| 32 | + tag: "latest" |
| 33 | + # -- Container launch command |
| 34 | + command: |
| 35 | + [ |
| 36 | + "vllm", |
| 37 | + "serve", |
| 38 | + "/data/", |
| 39 | + "--served-model-name", |
| 40 | + "opt-125m", |
| 41 | + "--enforce-eager", |
| 42 | + "--dtype", |
| 43 | + "bfloat16", |
| 44 | + "--block-size", |
| 45 | + "16", |
| 46 | + "--host", |
| 47 | + "0.0.0.0", |
| 48 | + "--port", |
| 49 | + "8000", |
| 50 | + ] |
| 51 | + containerPort: 8000 |
| 52 | + serviceName: vllm |
| 53 | + servicePort: 80 |
| 54 | + extraPorts: [] |
| 55 | + replicaCount: 1 |
| 56 | + deploymentStrategy: {} |
| 57 | + resources: |
| 58 | + requests: |
| 59 | + cpu: 4 |
| 60 | + memory: 24Gi |
| 61 | + nvidia.com/gpu: 1 |
| 62 | + limits: |
| 63 | + memory: 24Gi |
| 64 | + nvidia.com/gpu: 1 |
| 65 | + |
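| 66 | + # -- Type of GPU used (NOTE(review): the commented example below is a list item, but the value set here is an empty map `{}` - confirm whether the chart expects a list, i.e. `[]`) |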
| 67 | + gpuModels: {} |
| 68 | + # - "TYPE_GPU_USED" |
| 69 | + autoscaling: |
| 70 | + enabled: false |
| 71 | + |
| 72 | + # -- Configmap |
| 73 | + configs: {} |
| 74 | + |
| 75 | + # -- Secrets configuration |
| 76 | + secrets: {} |
| 77 | + |
| 78 | + # -- External configuration |
| 79 | + externalConfigs: [] |
| 80 | + |
| 81 | + # -- Custom Objects configuration |
| 82 | + customObjects: [] |
| 83 | + |
| 84 | + # -- Disruption Budget Configuration |
| 85 | + maxUnavailablePodDisruptionBudget: "" |
| 86 | + |
| 87 | + # -- Additional configuration for the init container |
| 88 | + extraInit: |
| 89 | + # -- Model download functionality (optional) |
| 90 | + modelDownload: |
| 91 | + # -- Enable model download job and wait container |
| 92 | + enabled: false |
| 93 | + # -- Image configuration for model download operations |
| 94 | + image: |
| 95 | + # -- Image repository |
| 96 | + repository: "amazon/aws-cli" |
| 97 | + # -- Image tag |
| 98 | + tag: "2.6.4" |
| 99 | + # -- Image pull policy |
| 100 | + pullPolicy: "IfNotPresent" |
| 101 | + # -- Wait container configuration (init container that waits for model to be ready) |
| 102 | + waitContainer: |
| 103 | + # -- Command to execute |
| 104 | + command: ["/bin/bash"] |
| 105 | + # -- Arguments for the wait container |
| 106 | + args: |
| 107 | + - "-eucx" |
| 108 | + - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done" |
| 109 | + # -- Environment variables (optional, overrides S3 defaults entirely if specified) |
| 110 | + # env: |
| 111 | + # - name: HUGGING_FACE_HUB_TOKEN |
| 112 | + # value: "your-token" |
| 113 | + # - name: MODEL_ID |
| 114 | + # value: "meta-llama/Llama-2-7b" |
| 115 | + # -- Download job configuration (job that actually downloads the model) |
| 116 | + downloadJob: |
| 117 | + # -- Command to execute |
| 118 | + command: ["/bin/bash"] |
| 119 | + # -- Arguments for the download job |
| 120 | + args: |
| 121 | + - "-eucx" |
| 122 | + - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data" |
| 123 | + # -- Environment variables (optional, overrides S3 defaults entirely if specified) |
| 124 | + # env: |
| 125 | + # - name: HUGGING_FACE_HUB_TOKEN |
| 126 | + # value: "your-token" |
| 127 | + # - name: MODEL_ID |
| 128 | + # value: "meta-llama/Llama-2-7b" |
| 129 | + |
| 130 | + # -- Custom init containers (appended after wait-download-model if modelDownload is enabled) |
| 131 | + initContainers: [] |
| 132 | + # Example for llm-d sidecar: |
| 133 | + # initContainers: |
| 134 | + # - name: llm-d-routing-proxy |
| 135 | + # image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0 |
| 136 | + # imagePullPolicy: IfNotPresent |
| 137 | + # ports: |
| 138 | + # - containerPort: 8080 |
| 139 | + # name: proxy |
| 140 | + # securityContext: |
| 141 | + # runAsUser: 1000 |
| 142 | + |
| 143 | + # -- Path on the S3 storage that hosts the model weights and config files |
| 144 | + s3modelpath: "relative_s3_model_path/opt-125m" |
| 145 | + # -- Storage size for the PVC |
| 146 | + pvcStorage: "1Gi" |
| 147 | + # -- Disable AWS EC2 metadata service |
| 148 | + awsEc2MetadataDisabled: true |
| 149 | + |
| 150 | + # -- Additional containers configuration |
| 151 | + extraContainers: [] |
| 152 | + |
| 153 | + # -- Readiness probe configuration |
| 154 | + readinessProbe: |
| 155 | + # -- Number of seconds after the container has started before the readiness probe is initiated |
| 156 | + initialDelaySeconds: 5 |
| 157 | + # -- How often (in seconds) to perform the readiness probe |
| 158 | + periodSeconds: 5 |
| 159 | + # -- Number of consecutive probe failures after which Kubernetes considers the overall check to have failed: the container is not ready |
| 160 | + failureThreshold: |
| 161 | + 3 |
| 162 | + # -- Configuration of the kubelet HTTP request to the server |
| 163 | + httpGet: |
| 164 | + # -- Path to access on the HTTP server |
| 165 | + path: /health |
| 166 | + # -- Name or number of the port to access on the container, on which the server is listening |
| 167 | + port: 8000 |
| 168 | + |
| 169 | + # -- Liveness probe configuration |
| 170 | + livenessProbe: |
| 171 | + # -- Number of seconds after the container has started before the liveness probe is initiated |
| 172 | + initialDelaySeconds: 15 |
| 173 | + # -- Number of consecutive probe failures after which Kubernetes considers the overall check to have failed: the container is not alive |
| 174 | + failureThreshold: 3 |
| 175 | + # -- How often (in seconds) to perform the liveness probe |
| 176 | + periodSeconds: 10 |
| 177 | + # -- Configuration of the kubelet HTTP request to the server |
| 178 | + httpGet: |
| 179 | + # -- Path to access on the HTTP server |
| 180 | + path: /health |
| 181 | + # -- Name or number of the port to access on the container, on which the server is listening |
| 182 | + port: 8000 |
| 183 | + |
| 184 | + labels: |
| 185 | + environment: "test" |
| 186 | + vllm-release: "v0.14.1" |
0 commit comments