@@ -7,12 +7,12 @@ huggingface:
   # Use a yaml anchor to avoid duplication elsewhere
   model: &model-name ise-uiuc/Magicoder-S-DS-6.7B
 
-  # For private/gated huggingface models (e.g. Meta's Llama models)
+  # For private/gated huggingface models (e.g. Meta's Llama models)
   # you must provide your own huggingface token, for details see:
   # https://huggingface.co/docs/hub/security-tokens
-
+
   # To do this, either provide the name of an existing secret on the cluster,
-  # which should be created before installing this chart by running
+  # which should be created before installing this chart by running
   # `kubectl create secret generic huggingface-token --from-env-file <file-name>`
   # where <file-name> is a file with the following contents:
   # HUGGING_FACE_HUB_TOKEN=<token-value>
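For reference, the `kubectl create secret` command quoted in the comments above produces a Secret roughly equivalent to the following manifest. The name `huggingface-token` is taken from the comments; the token value stays a placeholder:

```yaml
# Roughly what `kubectl create secret generic huggingface-token
# --from-env-file <file-name>` creates. kubectl stores the value
# base64-encoded under `data`; `stringData` accepts plain text at
# apply time and is equivalent.
apiVersion: v1
kind: Secret
metadata:
  name: huggingface-token
type: Opaque
stringData:
  HUGGING_FACE_HUB_TOKEN: <token-value>
```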
@@ -27,7 +27,7 @@
   image:
     repository: vllm/vllm-openai
     version: v0.2.7
-  # Service config
+  # Service config
   service:
     name: llm-backend
     type: ClusterIP
@@ -45,25 +45,23 @@
       path: /tmp/llm/huggingface-cache
   # Number of gpus to request for each api pod instance
   # NOTE: This must be in the range 1 <= value <= N, where
-  # 'N' is the number of GPUs available in a single
+  # 'N' is the number of GPUs available in a single
   # worker node on the target Kubernetes cluster.
   # NOTE: According to the vLLM docs found here
   # https://docs.vllm.ai/en/latest/serving/distributed_serving.html
-  # distributed / multi-GPU support should be available, though it
+  # distributed / multi-GPU support should be available, though it
   # has not been tested against this app.
   gpus: 1
   # The update strategy to use for the deployment
   # See https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#updating-a-deployment
   # NOTE: Changing this has implications for the number of additional GPU worker nodes required
   # to perform a rolling zero-downtime update
   updateStrategy:
-    rollingUpdate:
-      maxSurge: 0%
-      maxUnavailable: 100%
+    type: recreate
   # Extra args to supply to the vLLM backend, see
   # https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py
   extraArgs: []
-
+
 # Configuration for the frontend web interface
 ui:
   # The file from the UI config map to execute as the entrypoint to the frontend app
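The one substantive change in the hunk above swaps the rolling-update tuning for a recreate strategy. Both settings avoid demanding a spare GPU worker node during an upgrade: `maxSurge: 0%` with `maxUnavailable: 100%` tears down every old pod before any replacement starts, and a recreate strategy expresses the same intent directly. A minimal sketch of the before/after as it would land in the Deployment's `strategy` field, assuming the chart passes this values block through unmodified (note the Kubernetes API spells the type `Recreate`):

```yaml
# Before: rolling update tuned so all old pods stop before any new pod
# starts, keeping GPU demand flat during an upgrade.
# strategy:
#   type: RollingUpdate
#   rollingUpdate:
#     maxSurge: 0%
#     maxUnavailable: 100%

# After: delete every old pod, then create replacements; the GPU is freed
# before the new pod is scheduled. The API expects the capitalised form;
# whether the chart normalises the lowercase value is not shown in this diff.
strategy:
  type: Recreate
```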
@@ -77,7 +77,7 @@
   image:
     repository: ghcr.io/stackhpc/azimuth-llm-ui-base
     version: "984c499"
-  # Service config
+  # Service config
   service:
     name: web-app
     type: ClusterIP
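Since `extraArgs` is forwarded to vLLM's OpenAI-compatible server, here is a hedged example of a values override, assuming the chart appends the list to the server command line. `--max-model-len` is a standard vLLM engine flag, but check `api_server --help` for the pinned version (v0.2.7 here):

```yaml
# Hypothetical override, placed wherever `extraArgs: []` sits in the
# values file: cap the context length so the model fits on a smaller GPU.
extraArgs:
  - --max-model-len
  - "8192"
```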