Merge pull request #76 from stackhpc/cpu-vllm-ci

sd109 · web-flow · commit d2fcb960db39 · 2025-02-18T13:46:24.000Z
Add support for vLLM CPU backend and improve CI test coverage
diff --git a/.github/workflows/test-pr.yml b/.github/workflows/test-pr.yml
@@ -66,7 +66,7 @@ jobs:
         with:
           cluster_name: ${{ env.CLUSTER_NAME }}
 
-      # NOTE(scott): Since the local Chart.yaml uses "appVersion: latest" and this
+      # NOTE(scott): Since the local Chart.yaml uses "appVersion: latest" and this
       # only gets overwritten to the correct commit SHA during Helm chart build,
       # we need to pull these published images and load them into the kind cluster
       # with the tag correct tag.
diff --git a/charts/azimuth-chat/ci/test-values.yaml b/charts/azimuth-chat/ci/test-values.yaml
@@ -1,16 +1,24 @@
 azimuth-llm:
+  huggingface:
+    # Use the smallest LLM we can find
+    model: &model HuggingFaceTB/SmolLM2-135M-Instruct
   api:
-    enabled: false
+    # CI Kind cluster doesn't have kube-prometheus-stack
+    monitoring:
+      enabled: false
+    # No GPUs in CI runners
+    gpus: 0
   ui:
     service:
       zenith:
         enabled: false
     appSettings:
+      model_name: *model
       # Verify that we can set non-standard LLM params
       llm_params:
         max_tokens: 101
         temperature: 0.1
+        top_k: 2
         top_p: 0.15
-        top_k: 1
         presence_penalty: 0.9
         frequency_penalty: 1
diff --git a/charts/azimuth-image-analysis/ci/test-values.yaml b/charts/azimuth-image-analysis/ci/test-values.yaml
@@ -0,0 +1,23 @@
+azimuth-llm:
+  huggingface:
+    # Use the smallest vision model we can find
+    model: &model HuggingFaceTB/SmolVLM-256M-Instruct
+  api:
+    # CI Kind cluster doesn't have kube-prometheus-stack
+    monitoring:
+      enabled: false
+    # No GPUs in CI runners
+    gpus: 0
+  ui:
+    service:
+      zenith:
+        enabled: false
+    appSettings:
+      model_name: *model
+      # Verify that we can set non-standard LLM params
+      llm_params:
+        max_tokens: 10  # Constrain response tokens to speed up CI test
+        temperature: 0.1
+        top_p: 0.15
+        presence_penalty: 0.9
+        frequency_penalty: 1
diff --git a/charts/azimuth-image-analysis/ci/ui-only-values.yaml b/charts/azimuth-image-analysis/ci/ui-only-values.yaml
diff --git a/charts/azimuth-llm/ci/default-values.yaml b/charts/azimuth-llm/ci/default-values.yaml
@@ -0,0 +1,18 @@
+# This is intended to test the default chart values
+# as close as possible given the constraints of running
+# inside a Kind cluster within a CI runner
+huggingface:
+  # Use the smallest LLM we can find
+  model: &model HuggingFaceTB/SmolLM2-135M-Instruct
+api:
+  # CI Kind cluster doesn't have kube-prometheus-stack
+  monitoring:
+    enabled: false
+  # No GPUs in CI runners
+  gpus: 0
+ui:
+  service:
+    zenith:
+      enabled: false
+  appSettings:
+    model_name: *model
diff --git a/charts/azimuth-llm/ci/no-api-values.yaml b/charts/azimuth-llm/ci/no-api-values.yaml
diff --git a/charts/azimuth-llm/templates/api/deployment.yml b/charts/azimuth-llm/templates/api/deployment.yml
@@ -19,7 +19,8 @@ spec:
     spec:
       containers:
       - name: {{ .Release.Name }}-api
-        image: {{ printf "%s:%s" .Values.api.image.repository .Values.api.image.version }}
+        {{ $imageRepo := .Values.api.image.repository | default (ternary "ghcr.io/stackhpc/vllm-cpu" "vllm/vllm-openai" (eq (.Values.api.gpus | int) 0)) -}}
+        image: {{ printf "%s:%s" $imageRepo .Values.api.image.version }}
         ports:
         - name: api
           containerPort: 8000
@@ -29,7 +30,7 @@ spec:
         args:
           - --model
           - {{ .Values.huggingface.model }}
-          {{- include "azimuth-llm.chatTemplate" . | nindent 10 }}
+          {{- include "azimuth-llm.chatTemplate" . | nindent 10 -}}
           {{- if .Values.api.modelMaxContextLength -}}
           - --max-model-len
           - {{ .Values.api.modelMaxContextLength | quote }}
@@ -41,7 +42,7 @@ spec:
           {{- if .Values.api.extraArgs -}}
           {{- .Values.api.extraArgs | toYaml | nindent 10 }}
           {{- end -}}
-        {{- if .Values.huggingface.secretName }}
+        {{- if .Values.huggingface.secretName }}
         envFrom:
         - secretRef:
             name: {{ .Values.huggingface.secretName }}
diff --git a/charts/azimuth-llm/templates/api/ingress.yml b/charts/azimuth-llm/templates/api/ingress.yml
@@ -16,7 +16,7 @@ spec:
         pathType: Prefix
         backend:
           service:
-            name: {{ .Values.api.service.name }}
+            name: {{ .Release.Name }}-api
             port:
               # Must match Service resource
               number: 80
diff --git a/charts/azimuth-llm/templates/api/service.yml b/charts/azimuth-llm/templates/api/service.yml
@@ -2,7 +2,7 @@
 apiVersion: v1
 kind: Service
 metadata:
-  name: {{ .Values.api.service.name }}
+  name: {{ .Release.Name }}-api
   labels:
     {{- include "azimuth-llm.api-selectorLabels" . | nindent 4 }}
 spec:
diff --git a/charts/azimuth-llm/templates/api/zenith-client.yml b/charts/azimuth-llm/templates/api/zenith-client.yml
@@ -8,7 +8,7 @@ metadata:
 spec:
   reservationName: {{ .Release.Name }}-api
   upstream:
-    serviceName: {{ .Values.api.service.name }}
+    serviceName: {{ .Release.Name }}-api
   auth:
     skip: {{ .Values.api.service.zenith.skipAuth }}
 {{- end -}}
diff --git a/charts/azimuth-llm/templates/test/end-to-end.yml b/charts/azimuth-llm/templates/test/end-to-end.yml
@@ -10,21 +10,23 @@ spec:
     spec:
       containers:
       - name: gradio-client-test
-        {{- /*
-          Use the chat image since we know this contains the gradio_client package
-        */}}
-        image: {{ printf "ghcr.io/stackhpc/azimuth-llm-chat-ui:%s" (default .Chart.AppVersion .Values.ui.image.tag) }}
+        # Assumes that one of the in-repo Gradio apps is used and that
+        # the app image includes a `gradio-client-test.py` script.
+        image: {{ printf "%s:%s" .Values.ui.image.repository (default .Chart.AppVersion .Values.ui.image.tag) }}
         imagePullPolicy: IfNotPresent
         command:
         - python
         - gradio-client-test.py
         {{- if .Values.ingress.ui.enabled }}
         - {{ .Values.ingress.host }}{{ .Values.ingress.ui.path }}
         {{- else }}
-        - http://{{ .Values.ui.service.name }}.{{ .Release.Namespace }}.svc
+        - http://{{ .Release.Name }}-ui.{{ .Release.Namespace }}.svc
         {{- end }}
+        env:
+        - name: PYTHONUNBUFFERED
+          value: "1"
+        tty: true # Make stdout from python visible in k8s logs
       restartPolicy: Never
-  # Allow plenty of retries since downloading
-  # model weights can take a long time.
-  backoffLimit: 10
+  # Retries are handled within the gradio-client-test.py script
+  backoffLimit: 1
 {{- end -}}
diff --git a/charts/azimuth-llm/templates/test/web-app.yml b/charts/azimuth-llm/templates/test/web-app.yml
@@ -18,7 +18,7 @@ spec:
         {{- if .Values.ingress.ui.enabled }}
         - {{ .Values.ingress.host | trimPrefix "http://" | trimPrefix "https://" }}{{ .Values.ingress.ui.path }}
         {{- else }}
-        - {{ .Values.ui.service.name }}.{{ .Release.Namespace }}.svc
+        - {{ .Release.Name }}-ui.{{ .Release.Namespace }}.svc
         {{- end }}
         - "80"
       restartPolicy: Never
diff --git a/charts/azimuth-llm/templates/ui/app-config-map.yml b/charts/azimuth-llm/templates/ui/app-config-map.yml
@@ -7,5 +7,6 @@ metadata:
     {{- include "azimuth-llm.labels" . | nindent 4 }}
 data:
   overrides.yml: |
+    {{- $_ := set .Values.ui.appSettings "backend_url" (printf "http://%s-api.%s.svc" .Release.Name .Release.Namespace) }}
     {{- .Values.ui.appSettings | toYaml | nindent 4 }}
 {{- end -}}
diff --git a/charts/azimuth-llm/templates/ui/ingress.yml b/charts/azimuth-llm/templates/ui/ingress.yml
@@ -16,7 +16,7 @@ spec:
         pathType: Prefix
         backend:
           service:
-            name: {{ .Values.ui.service.name }}
+            name: {{ .Release.Name }}-ui
             port:
               # Must match Service resource
               number: 80
diff --git a/charts/azimuth-llm/templates/ui/service.yml b/charts/azimuth-llm/templates/ui/service.yml
@@ -2,7 +2,7 @@
 apiVersion: v1
 kind: Service
 metadata:
-  name: {{ .Values.ui.service.name }}
+  name: {{ .Release.Name }}-ui
   labels:
     {{- include "azimuth-llm.labels" . | nindent 4 }}
 spec:
diff --git a/charts/azimuth-llm/templates/ui/ui-zenith-client.yml b/charts/azimuth-llm/templates/ui/ui-zenith-client.yml
@@ -9,7 +9,7 @@ metadata:
 spec:
   reservationName: {{ .Release.Name }}-ui
   upstream:
-    serviceName: {{ .Values.ui.service.name }}
+    serviceName: {{ .Release.Name }}-ui
   auth:
     skip: {{ .Values.ui.service.zenith.skipAuth }}
 {{- end -}}
diff --git a/charts/azimuth-llm/values.yaml b/charts/azimuth-llm/values.yaml
@@ -33,15 +33,16 @@ api:
   enabled: true
   # Container image config
   image:
-    repository: vllm/vllm-openai
+    # Defaults to vllm/vllm-openai when api.gpus > 0
+    # or ghcr.io/stackhpc/vllm-cpu when api.gpus == 0
+    repository:
     version: v0.7.2
   monitoring:
     enabled: true
   # The number of replicas for the backend deployment
   replicas: 1
   # Service config
   service:
-    name: llm-backend
     type: ClusterIP
     zenith:
       enabled: false
@@ -56,8 +57,7 @@ api:
     hostPath:
       path: /tmp/llm/huggingface-cache
   # Number of gpus to requests for each api pod instance
-  # NOTE: This must be in the range 1 <= value <= N, where
-  # 'N' is the number of GPUs available in a single
+  # NOTE: This must not exceed the number of GPUs available in a single
   # worker node on the target Kubernetes cluster.
   # NOTE: According to the vLLM docs found here
   # https://docs.vllm.ai/en/latest/serving/distributed_serving.html
@@ -83,7 +83,7 @@ ui:
   # Container image config
   image:
     repository: ghcr.io/stackhpc/azimuth-llm-chat-ui
-    tag: # Defaults to chart's appVersion
+    tag:  # Defaults to chart's appVersion
     imagePullPolicy:
   # The settings to be passed to the frontend web app.
   # Format depends on the chosen UI image above. For each of the UIs
@@ -105,7 +105,6 @@ ui:
         - Arial
   # Service config
   service:
-    name: web-app
     type: ClusterIP
     zenith:
       enabled: true
diff --git a/ct.yaml b/ct.yaml
@@ -6,3 +6,6 @@ validate-maintainers: false
 all: true
 # Split output to make it look nice in GitHub Actions tab
 github-groups: true
+# Allow for long running install and test processes
+# (e.g. downloading container images and model weights)
+helm-extra-args: --timeout 1200s
diff --git a/web-apps/chat/app.py b/web-apps/chat/app.py
@@ -61,7 +61,6 @@ class PossibleSystemPromptException(Exception):
     streaming=True,
 )
 
-
 def inference(latest_message, history):
     # Allow mutating global variable
     global BACKEND_INITIALISED
diff --git a/web-apps/chat/defaults.yml b/web-apps/chat/defaults.yml
@@ -30,6 +30,6 @@ theme_params: {}
 theme_params_extended: {}
 
 # Additional CSS and JS overrides
-# See https://www.gradio.app/guides/custom-CSS-and-JS
+# See https://www.gradio.app/guides/custom-CSS-and-JS
 css_overrides:
 custom_javascript:
diff --git a/web-apps/chat/gradio-client-test.py b/web-apps/chat/gradio-client-test.py
@@ -1,7 +1,21 @@
 import sys
+import time
+
 from gradio_client import Client
 
 gradio_host = sys.argv[1]
-client = Client(gradio_host)
-result = client.predict("Hi", api_name="/chat")
-print(result)
+
+retries = 60  # attempts; a 10 second pause separates consecutive attempts
+for n in range(1, retries+1):
+    try:
+        client = Client(gradio_host)
+        result = client.predict("Hi", api_name="/chat")
+        print(result)
+        break  # success - leave the loop and exit with status 0
+    except Exception as err:
+        msg = f"Attempt {n} / {retries} encounter error: {err}"
+        if n < retries:
+            print(msg, "- waiting 10 seconds before retrying")
+            time.sleep(10)
+        else:
+            sys.exit(f"{msg} - no more retries left")  # prints to stderr, exit status 1
diff --git a/web-apps/image-analysis/gradio-client-test.py b/web-apps/image-analysis/gradio-client-test.py
@@ -0,0 +1,25 @@
+import sys
+import time
+
+from gradio_client import Client
+
+gradio_host = sys.argv[1]
+
+retries = 60
+for n in range(1, retries+1):
+    try:
+        client = Client(gradio_host)
+        result = client.predict(
+            image_url="https://media.licdn.com/dms/image/v2/D4D0BAQHyxNra6_PoUQ/company-logo_200_200/company-logo_200_200/0/1704365018113/stackhpc_ltd_logo?e=1747872000&v=beta&t=Ed3-KZS-sHlg-ne1KC0YjI4Ez7yVvJzWr103nm5eVK0",
+    		prompt="Hi",
+    		api_name="/predict"
+        )
+        print(result)
+        break
+    except Exception as err:
+        msg = f"Attempt {n} / {retries} encounter error: {err}"
+        if n < retries:
+            print(msg, "- waiting 10 seconds before retrying")
+            time.sleep(10)
+        else:
+            print(msg, "- no more retries left")
diff --git a/web-apps/utils/utils.py b/web-apps/utils/utils.py
@@ -43,28 +43,6 @@ class LLMParams(BaseModel):
     model_config = ConfigDict(extra="forbid")
 
 
-NAMESPACE_FILE_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
-
-
-def get_k8s_namespace():
-    try:
-        current_k8s_namespace = open(NAMESPACE_FILE_PATH).read()
-        return current_k8s_namespace
-    except FileNotFoundError:
-        return None
-
-
-def api_address_in_cluster():
-    k8s_ns = get_k8s_namespace()
-    if k8s_ns:
-        return f"http://llm-backend.{k8s_ns}.svc"
-    else:
-        log.warning(
-            "Failed to determine k8s namespace from %s - assuming non-kubernetes environment.",
-            NAMESPACE_FILE_PATH,
-        )
-
-
 # Method for loading settings from files
 def load_yaml(file_path: str) -> dict:
     with open(file_path, "r") as file:
@@ -95,12 +73,9 @@ def load_settings() -> dict:
             "Please check for typos"
         )
     settings = {**defaults, **overrides}
-    if "backend_url" not in settings or not settings["backend_url"]:
-        # Try to detect in-cluster address
-        in_cluster_backend = api_address_in_cluster()
-        if not in_cluster_backend:
-            raise Exception(
-                "Backend URL must be provided in settings when running outside of Kubernetes."
-            )
-        settings["backend_url"] = in_cluster_backend
+
+    if not settings.get("backend_url"):  # missing key OR empty/None value
+        raise Exception(
+            "Backend URL must be provided in settings when running outside of Kubernetes."
+        )
     return settings