Commit a151463

chore(llm): try vllm
1 parent 65c57ce commit a151463

8 files changed: +216 -8 lines changed
clusters/apps/env/production/home/llm/app/gitrepo.yaml

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+---
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: GitRepository
+metadata:
+  name: vllm-charts
+  namespace: flux-system
+spec:
+  interval: 1h0m0s
+  ignore: |
+    # exclude all
+    /*
+    # include chart dir
+    !/examples/online_serving/chart-helm
+  url: https://github.com/vllm-project/vllm
+  ref:
+    tag: v0.14.1
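
The ignore field uses .gitignore semantics: /* excludes everything in the repository, then !/examples/online_serving/chart-helm re-includes just the chart directory, so the artifact Flux stores contains only the Helm chart. A quick check that the source synced the v0.14.1 tag (a sketch, assuming the flux CLI is installed and pointed at this cluster):

  # verify the GitRepository is Ready and at the expected revision
  flux get sources git vllm-charts -n flux-system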
clusters/apps/env/production/home/llm/app/hr.yaml

Lines changed: 186 additions & 0 deletions

@@ -0,0 +1,186 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s-labs/helm-charts/refs/tags/common-3.1.0/charts/other/app-template/schemas/helmrelease-helm-v2beta2.schema.json
+---
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: ${APP}
+spec:
+  interval: 12h
+  chart:
+    spec:
+      chart: examples/online_serving/chart-helm
+      version: 0.0.1
+      sourceRef:
+        kind: GitRepository
+        name: vllm-charts
+        namespace: flux-system
+  maxHistory: 2
+  install:
+    remediation:
+      retries: 3
+  upgrade:
+    cleanupOnFail: true
+    remediation:
+      retries: 3
+  uninstall:
+    keepHistory: false
+  values:
+    image:
+      # -- Image repository
+      repository: "vllm/vllm-openai"
+      # -- Image tag
+      tag: "latest"
+    # -- Container launch command
+    command:
+      [
+        "vllm",
+        "serve",
+        "/data/",
+        "--served-model-name",
+        "opt-125m",
+        "--enforce-eager",
+        "--dtype",
+        "bfloat16",
+        "--block-size",
+        "16",
+        "--host",
+        "0.0.0.0",
+        "--port",
+        "8000",
+      ]
+    containerPort: 8000
+    serviceName: vllm
+    servicePort: 80
+    extraPorts: []
+    replicaCount: 1
+    deploymentStrategy: {}
+    resources:
+      requests:
+        cpu: 4
+        memory: 24Gi
+        nvidia.com/gpu: 1
+      limits:
+        memory: 24Gi
+        nvidia.com/gpu: 1
+
+    # -- Type of gpu used
+    gpuModels: {}
+    # - "TYPE_GPU_USED"
+    autoscaling:
+      enabled: false
+
+    # -- Configmap
+    configs: {}
+
+    # -- Secrets configuration
+    secrets: {}
+
+    # -- External configuration
+    externalConfigs: []
+
+    # -- Custom Objects configuration
+    customObjects: []
+
+    # -- Disruption Budget Configuration
+    maxUnavailablePodDisruptionBudget: ""
+
+    # -- Additional configuration for the init container
+    extraInit:
+      # -- Model download functionality (optional)
+      modelDownload:
+        # -- Enable model download job and wait container
+        enabled: false
+        # -- Image configuration for model download operations
+        image:
+          # -- Image repository
+          repository: "amazon/aws-cli"
+          # -- Image tag
+          tag: "2.6.4"
+          # -- Image pull policy
+          pullPolicy: "IfNotPresent"
+        # -- Wait container configuration (init container that waits for model to be ready)
+        waitContainer:
+          # -- Command to execute
+          command: ["/bin/bash"]
+          # -- Arguments for the wait container
+          args:
+            - "-eucx"
+            - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
+          # -- Environment variables (optional, overrides S3 defaults entirely if specified)
+          # env:
+          #   - name: HUGGING_FACE_HUB_TOKEN
+          #     value: "your-token"
+          #   - name: MODEL_ID
+          #     value: "meta-llama/Llama-2-7b"
+        # -- Download job configuration (job that actually downloads the model)
+        downloadJob:
+          # -- Command to execute
+          command: ["/bin/bash"]
+          # -- Arguments for the download job
+          args:
+            - "-eucx"
+            - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+          # -- Environment variables (optional, overrides S3 defaults entirely if specified)
+          # env:
+          #   - name: HUGGING_FACE_HUB_TOKEN
+          #     value: "your-token"
+          #   - name: MODEL_ID
+          #     value: "meta-llama/Llama-2-7b"
+
+      # -- Custom init containers (appended after wait-download-model if modelDownload is enabled)
+      initContainers: []
+      # Example for llm-d sidecar:
+      # initContainers:
+      #   - name: llm-d-routing-proxy
+      #     image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+      #     imagePullPolicy: IfNotPresent
+      #     ports:
+      #       - containerPort: 8080
+      #         name: proxy
+      #     securityContext:
+      #       runAsUser: 1000
+
+      # -- Path of the model on the s3 which hosts model weights and config files
+      s3modelpath: "relative_s3_model_path/opt-125m"
+      # -- Storage size for the PVC
+      pvcStorage: "1Gi"
+      # -- Disable AWS EC2 metadata service
+      awsEc2MetadataDisabled: true
+
+    # -- Additional containers configuration
+    extraContainers: []
+
+    # -- Readiness probe configuration
+    readinessProbe:
+      # -- Number of seconds after the container has started before the readiness probe is initiated
+      initialDelaySeconds: 5
+      # -- How often (in seconds) to perform the readiness probe
+      periodSeconds: 5
+      # -- Number of consecutive probe failures after which Kubernetes considers the container not ready
+      failureThreshold:
+        3
+      # -- Configuration of the Kubelet http request on the server
+      httpGet:
+        # -- Path to access on the HTTP server
+        path: /health
+        # -- Name or number of the port to access on the container, on which the server is listening
+        port: 8000
+
+    # -- Liveness probe configuration
+    livenessProbe:
+      # -- Number of seconds after the container has started before the liveness probe is initiated
+      initialDelaySeconds: 15
+      # -- Number of consecutive probe failures after which Kubernetes considers the container not alive
+      failureThreshold: 3
+      # -- How often (in seconds) to perform the liveness probe
+      periodSeconds: 10
+      # -- Configuration of the Kubelet http request on the server
+      httpGet:
+        # -- Path to access on the HTTP server
+        path: /health
+        # -- Name or number of the port to access on the container, on which the server is listening
+        port: 8000
+
+    labels:
+      environment: "test"
+      vllm-release: "v0.14.1"
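
The chart runs the stock vllm/vllm-openai image, so the pod serves vLLM's OpenAI-compatible API on containerPort 8000, exposed through the vllm Service on port 80; the probes hit the same server's /health endpoint. A minimal in-cluster smoke test (a sketch, assuming the release lands in the home namespace per ks.yaml below and the Service DNS name follows the usual service.namespace pattern):

  # completion request against the OpenAI-compatible endpoint
  curl -s http://vllm.home.svc.cluster.local/v1/completions \
    -H 'Content-Type: application/json' \
    -d '{"model": "opt-125m", "prompt": "Hello, my name is", "max_tokens": 16}'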

clusters/apps/env/production/home/llm/app/kustomization.yaml

Lines changed: 2 additions & 7 deletions

@@ -2,10 +2,5 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-  - app-template.yaml
-configMapGenerator:
-  - name: llama-configmap
-    files:
-      - config.yaml=./config/llama-config.yaml
-generatorOptions:
-  disableNameSuffixHash: true
+  - gitrepo.yaml
+  - hr.yaml

clusters/apps/env/production/home/llm/app/app-template.yaml renamed to clusters/apps/env/production/home/llm/app/llama-proxy/app-template.yaml

File renamed without changes.

clusters/apps/env/production/home/llm/app/config/llama-config.yaml renamed to clusters/apps/env/production/home/llm/app/llama-proxy/config/llama-config.yaml

File renamed without changes.
clusters/apps/env/production/home/llm/app/llama-proxy/kustomization.yaml

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - app-template.yaml
+configMapGenerator:
+  - name: llama-configmap
+    files:
+      - config.yaml=./config/llama-config.yaml
+generatorOptions:
+  disableNameSuffixHash: true
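
Because generatorOptions sets disableNameSuffixHash: true, the generated ConfigMap keeps the fixed name llama-configmap instead of getting a content-hash suffix, so references to it in app-template.yaml stay stable across config changes. The output can be checked locally (a sketch, assuming kustomize is installed and run from the repo root):

  # render the llama-proxy kustomization, including the generated ConfigMap
  kustomize build clusters/apps/env/production/home/llm/app/llama-proxy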

clusters/apps/env/production/home/llm/app/old.yaml renamed to clusters/apps/env/production/home/llm/app/ollama/old.yaml

File renamed without changes.

clusters/apps/env/production/home/llm/ks.yaml

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
-  name: &app ollama
+  name: &app vllm
 namespace: flux-system
 spec:
   targetNamespace: home
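
Renaming the Flux Kustomization from ollama to vllm swaps the deployment over to the new chart. To roll it out without waiting for the next sync interval (a sketch, assuming the flux CLI):

  # fetch the chart source, then reconcile the renamed Kustomization
  flux reconcile source git vllm-charts -n flux-system
  flux reconcile kustomization vllm -n flux-system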
