5 files changed: +48 −2 lines
File 1 of 5 (Tilt configuration):

@@ -1,6 +1,6 @@
 # Toggles whether UI should be run locally using gradio hot-reloading
 # or should be included in the remote Helm install
-run_ui_locally = True
+run_ui_locally = os.getenv("AZIMUTH_LLM_TILT_LOCAL_UI", True)
 
 # Allow non-local contexts
 allow_k8s_contexts(k8s_context())
File 2 of 5 (Azimuth UI metadata, controls and sort order):

@@ -10,10 +10,25 @@ controls:
     type: MirrorControl
     path: /huggingface/model
     visuallyHidden: true
+  # Azimuth UI doesn't handle json type ["integer","null"]
+  # properly so we allow any type in JSON schema then
+  # constrain to (optional) integer here.
+  /api/modelMaxContextLength:
+    type: IntegerControl
+    minimum: 100
+    step: 100
+    required: false
 
 sortOrder:
   - /huggingface/model
   - /huggingface/token
   - /ui/appSettings/hf_model_instruction
   - /ui/appSettings/page_title
+  - /api/image/version
   - /ui/appSettings/llm_temperature
+  - /ui/appSettings/llm_max_tokens
+  - /ui/appSettings/llm_frequency_penalty
+  - /ui/appSettings/llm_presence_penalty
+  - /ui/appSettings/llm_top_p
+  - /ui/appSettings/llm_top_k
+  - /api/modelMaxContextLength
File 3 of 5 (API deployment Helm template):

@@ -29,6 +29,10 @@
          - --model
          - {{ .Values.huggingface.model }}
          {{- include "azimuth-llm.chatTemplate" . | nindent 10 }}
+         {{- if .Values.api.modelMaxContextLength -}}
+         - --max-model-len
+         - {{ .Values.api.modelMaxContextLength | quote }}
+         {{- end -}}
          {{- if .Values.api.extraArgs -}}
          {{- .Values.api.extraArgs | toYaml | nindent 10 }}
          {{- end -}}
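For illustration, suppose an operator sets api.modelMaxContextLength to 8000 (a hypothetical value, not a chart default). The guarded block above then contributes two extra entries to the vLLM container's argument list, with the quote filter emitting the integer as a string, roughly:

          - --max-model-len
          - "8000"

If the value is left unset (the chart default shown in the values file below), the whole block is skipped, no --max-model-len flag is passed, and vLLM falls back to the model's own maximum context length.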
File 4 of 5 (Helm values JSON schema):

@@ -92,6 +92,26 @@
           "required": ["hf_model_name", "hf_model_instruction"]
         }
       }
+    },
+    "api": {
+      "type": "object",
+      "properties": {
+        "modelMaxContextLength": {
+          "title": "Model Context Length",
+          "description": "An override for the maximum context length to allow, if the model's default is not suitable."
+        },
+        "image": {
+          "type": "object",
+          "properties": {
+            "version": {
+              "type": "string",
+              "title": "Backend vLLM version",
+              "description": "The vLLM version to use as a backend. Must be a version tag from [this list](https://github.com/vllm-project/vllm/tags)",
+              "default": "v0.4.3"
+            }
+          }
+        }
+      }
     }
   }
 }
File 5 of 5 (default Helm values):

@@ -51,11 +51,13 @@
   iconUrl: https://raw.githubusercontent.com/vllm-project/vllm/v0.2.7/docs/source/assets/logos/vllm-logo-only-light.png
   description: |
     The raw inference API endpoints for the deployed LLM.
+
   # Config for huggingface model cache volume
   # This is mounted at /root/.cache/huggingface in the api deployment
   cacheVolume:
     hostPath:
       path: /tmp/llm/huggingface-cache
+
   # Number of gpus to requests for each api pod instance
   # NOTE: This must be in the range 1 <= value <= N, where
   #       'N' is the number of GPUs available in a single
@@ -71,8 +73,13 @@
   # to preform a rolling zero-downtime update
   updateStrategy:
     type: Recreate
+
+  # The value of the vLLM backend's max_model_len argument (if the model's default is not suitable)
+  # https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
+  modelMaxContextLength:
+
   # Extra args to supply to the vLLM backend, see
-  # https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py
+  # https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
   extraArgs: []
 
 # Configuration for the frontend web interface
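Taken together, the schema (file 4) and the defaults (file 5) let an operator opt in through a Helm values override. A minimal sketch, assuming a hypothetical override file and an illustrative context length of 8000 (neither is part of this change):

api:
  image:
    version: v0.4.3            # schema default; must be a tag from https://github.com/vllm-project/vllm/tags
  modelMaxContextLength: 8000  # rendered as --max-model-len "8000" by the template in file 3

Leaving modelMaxContextLength empty keeps the previous behaviour: the flag is omitted and the model's default context length applies.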