5 files changed: +48 −2 lines
File 1 of 5 (Tilt configuration):

@@ -1,6 +1,6 @@
 # Toggles whether UI should be run locally using gradio hot-reloading
 # or should be included in the remote Helm install
-run_ui_locally = True
+run_ui_locally = os.getenv("AZIMUTH_LLM_TILT_LOCAL_UI", True)
 
 # Allow non-local contexts
 allow_k8s_contexts(k8s_context())
File 2 of 5 (Azimuth UI metadata, controls and sort order):

@@ -10,10 +10,25 @@ controls:
     type: MirrorControl
     path: /huggingface/model
     visuallyHidden: true
+  # Azimuth UI doesn't handle json type ["integer","null"]
+  # properly so we allow any type in JSON schema then
+  # constrain to (optional) integer here.
+  /api/modelMaxContextLength:
+    type: IntegerControl
+    minimum: 100
+    step: 100
+    required: false
 
 sortOrder:
   - /huggingface/model
   - /huggingface/token
   - /ui/appSettings/hf_model_instruction
   - /ui/appSettings/page_title
+  - /api/image/version
   - /ui/appSettings/llm_temperature
+  - /ui/appSettings/llm_max_tokens
+  - /ui/appSettings/llm_frequency_penalty
+  - /ui/appSettings/llm_presence_penalty
+  - /ui/appSettings/llm_top_p
+  - /ui/appSettings/llm_top_k
+  - /api/modelMaxContextLength
File 3 of 5 (API deployment Helm template):

@@ -29,6 +29,10 @@
          - --model
          - {{ .Values.huggingface.model }}
          {{- include "azimuth-llm.chatTemplate" . | nindent 10 }}
+         {{- if .Values.api.modelMaxContextLength -}}
+         - --max-model-len
+         - {{ .Values.api.modelMaxContextLength | quote }}
+         {{- end -}}
          {{- if .Values.api.extraArgs -}}
          {{- .Values.api.extraArgs | toYaml | nindent 10 }}
          {{- end -}}
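For illustration, suppose an operator sets api.modelMaxContextLength to 8000 (a hypothetical value, not a chart default). The guarded block above then contributes two extra entries to the vLLM container's argument list, with the quote filter emitting the integer as a string, roughly:

          - --max-model-len
          - "8000"

If the value is left unset (the chart default shown in the values file below), the whole block is skipped, no --max-model-len flag is passed, and vLLM falls back to the model's own maximum context length.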
File 4 of 5 (Helm values JSON schema):

@@ -92,6 +92,26 @@
           "required": ["hf_model_name", "hf_model_instruction"]
         }
       }
+    },
+    "api": {
+      "type": "object",
+      "properties": {
+        "modelMaxContextLength": {
+          "title": "Model Context Length",
+          "description": "An override for the maximum context length to allow, if the model's default is not suitable."
+        },
+        "image": {
+          "type": "object",
+          "properties": {
+            "version": {
+              "type": "string",
+              "title": "Backend vLLM version",
+              "description": "The vLLM version to use as a backend. Must be a version tag from [this list](https://github.com/vllm-project/vllm/tags)",
+              "default": "v0.4.3"
+            }
+          }
+        }
+      }
     }
   }
 }
File 5 of 5 (default Helm values):

@@ -51,11 +51,13 @@
   iconUrl: https://raw.githubusercontent.com/vllm-project/vllm/v0.2.7/docs/source/assets/logos/vllm-logo-only-light.png
   description: |
     The raw inference API endpoints for the deployed LLM.
+
   # Config for huggingface model cache volume
   # This is mounted at /root/.cache/huggingface in the api deployment
   cacheVolume:
     hostPath:
       path: /tmp/llm/huggingface-cache
+
   # Number of gpus to requests for each api pod instance
   # NOTE: This must be in the range 1 <= value <= N, where
   #       'N' is the number of GPUs available in a single
@@ -71,8 +73,13 @@
   # to preform a rolling zero-downtime update
   updateStrategy:
     type: Recreate
+
+  # The value of the vLLM backend's max_model_len argument (if the model's default is not suitable)
+  # https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
+  modelMaxContextLength:
+
   # Extra args to supply to the vLLM backend, see
-  # https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py
+  # https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
   extraArgs: []
 
 # Configuration for the frontend web interface
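Taken together, the schema (file 4) and the defaults (file 5) let an operator opt in through a Helm values override. A minimal sketch, assuming a hypothetical override file and an illustrative context length of 8000 (neither is part of this change):

api:
  image:
    version: v0.4.3            # schema default; must be a tag from https://github.com/vllm-project/vllm/tags
  modelMaxContextLength: 8000  # rendered as --max-model-len "8000" by the template in file 3

Leaving modelMaxContextLength empty keeps the previous behaviour: the flag is omitted and the model's default context length applies.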