diff --git a/.github/workflows/build-push-artifacts.yml b/.github/workflows/build-push-artifacts.yml index 10d5c1e..73d8370 100644 --- a/.github/workflows/build-push-artifacts.yml +++ b/.github/workflows/build-push-artifacts.yml @@ -28,18 +28,24 @@ jobs: images: - 'web-apps/**' chart: - - 'chart/**' + - 'charts/**' # Job to build container images build_push_images: name: Build and push images runs-on: ubuntu-latest + permissions: + contents: read + id-token: write # needed for signing the images with GitHub OIDC Token + packages: write # required for pushing container images + security-events: write # required for pushing SARIF files needs: changes - if: ${{ needs.changes.outputs.images == 'true' || github.ref_type == 'tag' }} + if: ${{ github.ref_type == 'tag' || needs.changes.outputs.images == 'true' }} strategy: matrix: include: - - component: chat-interface + - component: chat + - component: image-analysis steps: - name: Check out the repository uses: actions/checkout@v4 @@ -55,7 +61,7 @@ jobs: id: image-meta uses: docker/metadata-action@v5 with: - images: ghcr.io/stackhpc/azimuth-llm-${{ matrix.component }} + images: ghcr.io/stackhpc/azimuth-llm-${{ matrix.component }}-ui # Produce the branch name or tag and the SHA as tags tags: | type=ref,event=branch @@ -63,10 +69,11 @@ jobs: type=sha,prefix= - name: Build and push image - uses: azimuth-cloud/github-actions/docker-multiarch-build-push@update-trivy-action + uses: azimuth-cloud/github-actions/docker-multiarch-build-push@master with: cache-key: ${{ matrix.component }} - context: ./web-apps/${{ matrix.component }} + context: ./web-apps/ + file: ./web-apps/${{ matrix.component }}/Dockerfile platforms: linux/amd64,linux/arm64 push: true tags: ${{ steps.image-meta.outputs.tags }} @@ -78,7 +85,7 @@ jobs: runs-on: ubuntu-latest # Only build and push the chart if chart files have changed needs: [changes] - if: ${{ needs.changes.outputs.chart == 'true' || github.ref_type == 'tag' }} + if: ${{ github.ref_type == 'tag' || needs.changes.outputs.chart == 'true' }} steps: - name: Check out the repository uses: actions/checkout@v4 @@ -94,6 +101,7 @@ jobs: - name: Publish Helm charts uses: azimuth-cloud/github-actions/helm-publish@master with: + directory: charts token: ${{ secrets.GITHUB_TOKEN }} version: ${{ steps.semver.outputs.version }} app-version: ${{ steps.semver.outputs.short-sha }} diff --git a/.github/workflows/test-pr.yml b/.github/workflows/test-pr.yml index 1187e7e..a064702 100644 --- a/.github/workflows/test-pr.yml +++ b/.github/workflows/test-pr.yml @@ -28,10 +28,6 @@ jobs: - name: Run chart linting run: ct lint --config ct.yaml - - name: Run helm template with default values - run: helm template ci-test . - working-directory: chart - - name: Create Kind Cluster uses: helm/kind-action@v1 with: diff --git a/.gitignore b/.gitignore index 7d21b1b..b6862d1 100644 --- a/.gitignore +++ b/.gitignore @@ -11,5 +11,11 @@ test-values.y[a]ml **venv*/ # Helm chart stuff -chart/Chart.lock -chart/charts +charts/*/Chart.lock +charts/*/charts + +# Python stuff +**/build/ +**/*.egg-info/ +**/flagged/ +web-apps/**/overrides.yml diff --git a/README.md b/README.md index 58b0812..98068a8 100644 --- a/README.md +++ b/README.md @@ -34,38 +34,36 @@ ui: enabled: false ``` -***Warning*** - Exposing the services in this way provides no authentication mechanism and anyone with access to the load balancer IPs will be able to query the language model. It is up to you to secure the running service in your own way. 
In contrast, when deploying via Azimuth, authentication is provided via the standard Azimuth Identity Provider mechanisms and the authenticated services are exposed via [Zenith](https://github.com/stackhpc/zenith). +> [!WARNING] +> Exposing the services in this way provides no authentication mechanism and anyone with access to the load balancer IPs will be able to query the language model. It is up to you to secure the running service as appropriate for your use case. In contrast, when deployed via Azimuth, authentication is provided via the standard Azimuth Identity Provider mechanisms and the authenticated services are exposed via [Zenith](https://github.com/stackhpc/zenith). -The UI can also optionally be exposed using a Kubernetes Ingress resource. See the `ui.ingress` section in `values.yml` for available config options. +Both the web-based interface and the backend OpenAI-compatible vLLM API server can also optionally be exposed using [Kubernetes Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/). See the `ingress` section in `values.yml` for available config options. ## Tested Models -The following is a non-exhaustive list of models which have been tested with this app: -- [Llama 2 7B chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) -- [AWQ Quantized Llama 2 70B](https://huggingface.co/TheBloke/Llama-2-70B-Chat-AWQ) -- [Magicoder 6.7B](https://huggingface.co/ise-uiuc/Magicoder-S-DS-6.7B) -- [Mistral 7B Instruct v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) -- [WizardCoder Python 34B](https://huggingface.co/WizardLM/WizardCoder-Python-34B-V1.0) -- [AWQ Quantized Mixtral 8x7B Instruct v0.1](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ) +The application uses [vLLM](https://docs.vllm.ai/en/latest/index.html) for model serving, therefore any of the vLLM [supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) should work. Since vLLM pulls the model files directly from [HuggingFace](https://huggingface.co/models), other models may also turn out to be compatible, but mileage may vary between models and model architectures. If a model is incompatible with vLLM then the API pod will likely enter a `CrashLoopBackOff` state and any relevant error information will be found in the API pod logs. These logs can be viewed with -Due to the combination of [components](##Components) used in this app, some HuggingFace models may not work as expected (usually due to the way in which LangChain formats the prompt messages). Any errors when using a new model will appear in the logs for either the web-app pod or the backend API pod. Please open an issue if you would like explicit support for a specific model that is not in the above list. +``` +kubectl (-n <namespace>) logs deploy/<helm-release-name>-api +``` + +If you suspect that a given error is caused by this Helm chart rather than by upstream vLLM's model support then please [open an issue](https://github.com/stackhpc/azimuth-llm/issues). ## Monitoring -The LLM chart integrates with [kube-prometheus-stack](https://artifacthub.io/packages/helm/prometheus-community/kube-prometheus-stack) by creating a `ServiceMonitor` resource and installing a custom Grafana dashboard as a Kubernetes `ConfigMap`. If the target cluster has an existing `kube-prometheus-stack` deployment which is appropriately configured to watch all namespaces for new Grafana dashboards, the custom LLM dashboard provided here will automatically picked up by Grafana. 
It will appear in the Grafana dashboard list with the name 'LLM dashboard'. +The LLM chart integrates with [kube-prometheus-stack](https://artifacthub.io/packages/helm/prometheus-community/kube-prometheus-stack) by creating a `ServiceMonitor` resource and installing two custom Grafana dashboards as Kubernetes `ConfigMap`s. If the target cluster has an existing `kube-prometheus-stack` deployment which is appropriately configured to watch all namespaces for new Grafana dashboards, the LLM dashboards will automatically appear in Grafana's dashboard list. To disable the monitoring integrations, set the `api.monitoring.enabled` value to `false`. ## Components The Helm chart consists of the following components: -- A backend web API which runs [vLLM](https://github.com/vllm-project/vllm)'s [OpenAI compatible web server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server). +- A backend web API which runs [vLLM](https://github.com/vllm-project/vllm)'s [OpenAI compatible web server](https://docs.vllm.ai/en/stable/getting_started/quickstart.html#openai-compatible-server). -- A frontend web-app built using [Gradio](https://www.gradio.app) and [LangChain](https://www.langchain.com). The web app source code can be found in `chart/web-app` and gets written to a ConfigMap during the chart build and is then mounted into the UI pod and executed as the entry point for the UI docker image (built from `images/ui-base/Dockerfile`). +- A choice of frontend web-apps built using [Gradio](https://www.gradio.app) (see [web-apps](./web-apps/)). Each web interface is available as a pre-built container image [hosted on ghcr.io](https://github.com/orgs/stackhpc/packages?repo_name=azimuth-llm) and can be configured for each Helm release by changing the `ui.image` section of the chart values. -- A [stakater/Reloader](https://github.com/stakater/Reloader) instance which monitors the web-app ConfigMap for changes and restarts the frontend when the app code changes (i.e. whenever the Helm values are updated). + + + diff --git a/chart/azimuth-ui.schema.yaml b/chart/azimuth-ui.schema.yaml deleted file mode 100644 index de283f1..0000000 --- a/chart/azimuth-ui.schema.yaml +++ /dev/null @@ -1,34 +0,0 @@ -controls: - /huggingface/model: - type: TextControl - required: true - /huggingface/token: - type: TextControl - secret: true - # Use mirror to mimic yaml anchor in base Helm chart - /ui/appSettings/hf_model_name: - type: MirrorControl - path: /huggingface/model - visuallyHidden: true - # Azimuth UI doesn't handle json type ["integer","null"] - # properly so we allow any type in JSON schema then - # constrain to (optional) integer here. 
- /api/modelMaxContextLength: - type: IntegerControl - minimum: 100 - step: 100 - required: false - -sortOrder: - - /huggingface/model - - /huggingface/token - - /ui/appSettings/hf_model_instruction - - /ui/appSettings/page_title - - /api/image/version - - /ui/appSettings/llm_temperature - - /ui/appSettings/llm_max_tokens - - /ui/appSettings/llm_frequency_penalty - - /ui/appSettings/llm_presence_penalty - - /ui/appSettings/llm_top_p - - /ui/appSettings/llm_top_k - - /api/modelMaxContextLength diff --git a/chart/values.schema.json b/chart/values.schema.json deleted file mode 100644 index 8d20cf7..0000000 --- a/chart/values.schema.json +++ /dev/null @@ -1,124 +0,0 @@ -{ - "$schema": "http://json-schema.org/schema#", - "type": "object", - "properties": { - "huggingface": { - "type": "object", - "properties": { - "model": { - "type": "string", - "title": "Model", - "description": "The [HuggingFace model](https://huggingface.co/models) to deploy (see [here](https://github.com/stackhpc/azimuth-llm?tab=readme-ov-file#tested-models) for a list of tested models).", - "default": "microsoft/Phi-3.5-mini-instruct" - }, - "token": { - "type": [ - "string", - "null" - ], - "title": "Access Token", - "description": "A HuggingFace [access token](https://huggingface.co/docs/hub/security-tokens). Required for [gated models](https://huggingface.co/docs/hub/en/models-gated) (e.g. Llama 3)." - } - }, - "required": [ - "model" - ] - }, - "ui": { - "type": "object", - "properties": { - "appSettings": { - "type": "object", - "properties": { - "hf_model_name": { - "type": "string", - "title": "Model Name", - "description": "Model name supplied to the OpenAI client in frontend web app. Should match huggingface.model above." - }, - "hf_model_instruction": { - "type": "string", - "title": "Instruction", - "description": "The initial system prompt (i.e. the hidden instruction) to use when generating responses.", - "default": "You are a helpful AI assistant. Please respond appropriately." 
- }, - "page_title": { - "type": "string", - "title": "Page Title", - "description": "The title to display at the top of the chat interface.", - "default": "Large Language Model" - }, - "llm_max_tokens": { - "type": "integer", - "title": "Max Tokens", - "description": "The maximum number of new [tokens](https://platform.openai.com/docs/api-reference/chat/create#chat-create-max_tokens) to generate for each LLM responses.", - "default": 1000 - }, - "llm_temperature": { - "type": "number", - "title": "LLM Temperature", - "description": "The [temperature](https://platform.openai.com/docs/api-reference/chat/create#chat-create-temperature) value to use when generating LLM responses.", - "default": 0, - "minimum": 0, - "maximum": 2 - }, - "llm_top_p": { - "type": "number", - "title": "LLM Top P", - "description": "The [top p](https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p) value to use when generating LLM responses.", - "default": 1, - "exclusiveMinimum": 0, - "maximum": 1 - }, - "llm_top_k": { - "type": "integer", - "title": "LLM Top K", - "description": "The [top k](https://docs.vllm.ai/en/stable/dev/sampling_params.html) value to use when generating LLM responses (must be an integer).", - "default": -1, - "minimum": -1 - }, - "llm_presence_penalty": { - "type": "number", - "title": "LLM Presence Penalty", - "description": "The [presence penalty](https://platform.openai.com/docs/api-reference/chat/create#chat-create-presence_penalty) to use when generating LLM responses.", - "default": 0, - "minimum": -2, - "maximum": 2 - }, - "llm_frequency_penalty": { - "type": "number", - "title": "LLM Frequency Penalty", - "description": "The [frequency_penalty](https://platform.openai.com/docs/api-reference/chat/create#chat-create-frequency_penalty) to use when generating LLM responses.", - "default": 0, - "minimum": -2, - "maximum": 2 - } - }, - "required": [ - "hf_model_name", - "hf_model_instruction" - ] - } - } - }, - "api": { - "type": "object", - "properties": { - "modelMaxContextLength": { - "title": "Model Context Length", - "description": "An override for the maximum context length to allow, if the model's default is not suitable." - }, - "image": { - "type": "object", - "properties": { - "version": { - "type": "string", - "title": "Backend vLLM version", - "description": "The vLLM version to use as a backend. Must be a version tag from [this list](https://github.com/vllm-project/vllm/tags)", - "default": "v0.6.3" - } - } - } - } - } - } -} diff --git a/charts/azimuth-chat/Chart.yaml b/charts/azimuth-chat/Chart.yaml new file mode 100644 index 0000000..97dd341 --- /dev/null +++ b/charts/azimuth-chat/Chart.yaml @@ -0,0 +1,22 @@ +apiVersion: v2 +name: azimuth-llm-chat +description: HuggingFace LLM serving along with a simple web interface. 
+maintainers: + - name: "Scott Davidson" + email: scott@stackhpc.com + +type: application + +version: 0.1.0 + +appVersion: "0.1.0" + +icon: https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg + +annotations: + azimuth.stackhpc.com/label: HuggingFace Chat + +dependencies: + - name: azimuth-llm + version: ">=0-0" + repository: "file://../azimuth-llm/" diff --git a/charts/azimuth-chat/azimuth-ui.schema.yaml b/charts/azimuth-chat/azimuth-ui.schema.yaml new file mode 100644 index 0000000..74bd573 --- /dev/null +++ b/charts/azimuth-chat/azimuth-ui.schema.yaml @@ -0,0 +1,33 @@ +controls: + /azimuth-llm/huggingface/model: + type: TextControl + required: true + /azimuth-llm/huggingface/token: + type: TextControl + secret: true + # Use mirror to mimic yaml anchor in base Helm chart + /azimuth-llm/ui/appSettings/model_name: + type: MirrorControl + path: /azimuth-llm/huggingface/model + visuallyHidden: true + # Azimuth UI doesn't handle json type ["integer","null"] + # properly so we allow any type in JSON schema then + # constrain to (optional) integer here. + /azimuth-llm/api/modelMaxContextLength: + type: IntegerControl + minimum: 100 + required: false + +sortOrder: + - /azimuth-llm/huggingface/model + - /azimuth-llm/huggingface/token + - /azimuth-llm/ui/appSettings/model_instruction + - /azimuth-llm/ui/appSettings/page_title + - /azimuth-llm/api/image/version + - /azimuth-llm/ui/appSettings/llm_params/temperature + - /azimuth-llm/ui/appSettings/llm_params/max_tokens + - /azimuth-llm/ui/appSettings/llm_params/frequency_penalty + - /azimuth-llm/ui/appSettings/llm_params/presence_penalty + - /azimuth-llm/ui/appSettings/llm_params/top_p + - /azimuth-llm/ui/appSettings/llm_params/top_k + - /azimuth-llm/api/modelMaxContextLength diff --git a/charts/azimuth-chat/ci/ui-only-values.yaml b/charts/azimuth-chat/ci/ui-only-values.yaml new file mode 100644 index 0000000..b66347d --- /dev/null +++ b/charts/azimuth-chat/ci/ui-only-values.yaml @@ -0,0 +1,16 @@ +azimuth-llm: + api: + enabled: false + ui: + service: + zenith: + enabled: false + appSettings: + # Verify that we can set non-standard LLM params + llm_params: + max_tokens: 101 + temperature: 0.1 + top_p: 0.15 + top_k: 1 + presence_penalty: 0.9 + frequency_penalty: 1 diff --git a/charts/azimuth-chat/values.schema.json b/charts/azimuth-chat/values.schema.json new file mode 100644 index 0000000..ebc2622 --- /dev/null +++ b/charts/azimuth-chat/values.schema.json @@ -0,0 +1,133 @@ +{ + "type": "object", + "properties": { + "azimuth-llm": { + "type": "object", + "properties": { + "huggingface": { + "type": "object", + "properties": { + "model": { + "type": "string", + "title": "Model", + "description": "The [HuggingFace model](https://huggingface.co/models) to deploy (see [here](https://github.com/stackhpc/azimuth-llm?tab=readme-ov-file#tested-models) for a list of tested models).", + "default": "microsoft/Phi-3.5-mini-instruct" + }, + "token": { + "type": [ + "string", + "null" + ], + "title": "Access Token", + "description": "A HuggingFace [access token](https://huggingface.co/docs/hub/security-tokens). Required for [gated models](https://huggingface.co/docs/hub/en/models-gated) (e.g. Llama 3)." + } + }, + "required": [ + "model" + ] + }, + "api": { + "type": "object", + "properties": { + "modelMaxContextLength": { + "title": "Model Context Length", + "description": "An override for the maximum context length to allow, if the model's default is not suitable." 
+ }, + "image": { + "type": "object", + "properties": { + "version": { + "type": "string", + "title": "Backend vLLM version", + "description": "The vLLM version to use as a backend. Must be a version tag from [this list](https://github.com/vllm-project/vllm/tags)", + "default": "v0.6.3" + } + } + } + } + }, + "ui": { + "type": "object", + "properties": { + "appSettings": { + "type": "object", + "properties": { + "model_name": { + "type": "string", + "title": "Model Name", + "description": "Model name supplied to the OpenAI client in frontend web app. Should match huggingface.model above." + }, + "model_instruction": { + "type": "string", + "title": "Instruction", + "description": "The initial system prompt (i.e. the hidden instruction) to use when generating responses.", + "default": "You are a helpful AI assistant. Please respond appropriately." + }, + "page_title": { + "type": "string", + "title": "Page Title", + "description": "The title to display at the top of the chat interface.", + "default": "Large Language Model" + }, + "llm_params": { + "type": "object", + "properties": { + "max_tokens": { + "type": "integer", + "title": "Max Tokens", + "description": "The maximum number of new [tokens](https://platform.openai.com/docs/api-reference/chat/create#chat-create-max_tokens) to generate for each LLM response.", + "default": 1000 + }, + "temperature": { + "type": "number", + "title": "LLM Temperature", + "description": "The [temperature](https://platform.openai.com/docs/api-reference/chat/create#chat-create-temperature) value to use when generating LLM responses.", + "default": 0, + "minimum": 0, + "maximum": 2 + }, + "top_p": { + "type": "number", + "title": "LLM Top P", + "description": "The [top p](https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p) value to use when generating LLM responses.", + "default": 1, + "exclusiveMinimum": 0, + "maximum": 1 + }, + "top_k": { + "type": "integer", + "title": "LLM Top K", + "description": "The [top k](https://docs.vllm.ai/en/stable/dev/sampling_params.html) value to use when generating LLM responses (must be an integer).", + "default": -1, + "minimum": -1 + }, + "presence_penalty": { + "type": "number", + "title": "LLM Presence Penalty", + "description": "The [presence penalty](https://platform.openai.com/docs/api-reference/chat/create#chat-create-presence_penalty) to use when generating LLM responses.", + "default": 0, + "minimum": -2, + "maximum": 2 + }, + "frequency_penalty": { + "type": "number", + "title": "LLM Frequency Penalty", + "description": "The [frequency_penalty](https://platform.openai.com/docs/api-reference/chat/create#chat-create-frequency_penalty) to use when generating LLM responses.", + "default": 0, + "minimum": -2, + "maximum": 2 + } + } + } + }, + "required": [ + "model_name", + "model_instruction" + ] + } + } + } + } + } + } +} diff --git a/charts/azimuth-chat/values.yaml b/charts/azimuth-chat/values.yaml new file mode 100644 index 0000000..9a17317 --- /dev/null +++ b/charts/azimuth-chat/values.yaml @@ -0,0 +1,9 @@ +azimuth-llm: + huggingface: + model: &model-name microsoft/Phi-3.5-mini-instruct + ui: + image: + repository: ghcr.io/stackhpc/azimuth-llm-chat-ui + appSettings: + model_name: *model-name + model_instruction: "You are a helpful AI assistant; please respond appropriately." 
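The `&model-name` / `*model-name` pair in the values file above is a plain YAML anchor and alias: it is resolved by the YAML parser before Helm (or the Azimuth UI) ever sees the values, which is why the `azimuth-ui.schema.yaml` above needs a `MirrorControl` to "mimic" the anchor for values submitted through the UI form. A minimal sketch of the behaviour, assuming only PyYAML:

```python
# Illustrative sketch (not part of this diff): the '&model-name' anchor and
# '*model-name' alias in values.yaml are resolved by the YAML parser itself,
# so both fields always carry the same value after loading.
import yaml

values = yaml.safe_load("""
azimuth-llm:
  huggingface:
    model: &model-name microsoft/Phi-3.5-mini-instruct
  ui:
    appSettings:
      model_name: *model-name
""")

subchart = values["azimuth-llm"]
assert (
    subchart["huggingface"]["model"]
    == subchart["ui"]["appSettings"]["model_name"]
)
print(subchart["ui"]["appSettings"]["model_name"])  # microsoft/Phi-3.5-mini-instruct
```

Because the alias only exists within this one file, overriding `huggingface.model` on its own (e.g. via `--set`) does not update `ui.appSettings.model_name`, hence the mirrored control and the matching defaults in both charts.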
diff --git a/charts/azimuth-image-analysis/Chart.yaml b/charts/azimuth-image-analysis/Chart.yaml new file mode 100644 index 0000000..238016b --- /dev/null +++ b/charts/azimuth-image-analysis/Chart.yaml @@ -0,0 +1,22 @@ +apiVersion: v2 +name: azimuth-llm-image-analysis +description: HuggingFace vision model serving along with a simple web interface. +maintainers: + - name: "Scott Davidson" + email: scott@stackhpc.com + +type: application + +version: 0.1.0 + +appVersion: "0.1.0" + +icon: https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg + +annotations: + azimuth.stackhpc.com/label: HuggingFace Image Analysis + +dependencies: + - name: azimuth-llm + version: ">=0-0" + repository: "file://../azimuth-llm/" diff --git a/charts/azimuth-image-analysis/azimuth-ui.schema.yaml b/charts/azimuth-image-analysis/azimuth-ui.schema.yaml new file mode 100644 index 0000000..a960081 --- /dev/null +++ b/charts/azimuth-image-analysis/azimuth-ui.schema.yaml @@ -0,0 +1,33 @@ +controls: + /azimuth-llm/huggingface/model: + type: TextControl + required: true + /azimuth-llm/huggingface/token: + type: TextControl + secret: true + # Use mirror to mimic yaml anchor in base Helm chart + /azimuth-llm/ui/appSettings/model_name: + type: MirrorControl + path: /azimuth-llm/huggingface/model + visuallyHidden: true + # Azimuth UI doesn't handle json type ["integer","null"] + # properly so we allow any type in JSON schema then + # constrain to (optional) integer here. + /azimuth-llm/api/modelMaxContextLength: + type: IntegerControl + minimum: 100 + required: false + +sortOrder: + - /azimuth-llm/huggingface/model + - /azimuth-llm/huggingface/token + - /azimuth-llm/api/image/version + - /azimuth-llm/ui/appSettings/llm_params/temperature + - /azimuth-llm/ui/appSettings/llm_params/max_tokens + - /azimuth-llm/ui/appSettings/llm_params/frequency_penalty + - /azimuth-llm/ui/appSettings/llm_params/presence_penalty + - /azimuth-llm/ui/appSettings/llm_params/top_p + # vLLM responds with HTTP 400 BadRequest when top_k is + # passed to a vision model (but ollama accepts it) + # - /azimuth-llm/ui/appSettings/llm_params/top_k + - /azimuth-llm/api/modelMaxContextLength diff --git a/charts/azimuth-image-analysis/ci/ui-only-values.yaml b/charts/azimuth-image-analysis/ci/ui-only-values.yaml new file mode 100644 index 0000000..96f716d --- /dev/null +++ b/charts/azimuth-image-analysis/ci/ui-only-values.yaml @@ -0,0 +1,15 @@ +azimuth-llm: + api: + enabled: false + ui: + service: + zenith: + enabled: false + appSettings: + # Verify that we can set non-standard LLM params + llm_params: + max_tokens: 101 + temperature: 0.1 + top_p: 0.15 + presence_penalty: 0.9 + frequency_penalty: 1 diff --git a/charts/azimuth-image-analysis/values.schema.json b/charts/azimuth-image-analysis/values.schema.json new file mode 100644 index 0000000..c8be1ac --- /dev/null +++ b/charts/azimuth-image-analysis/values.schema.json @@ -0,0 +1,114 @@ +{ + "type": "object", + "properties": { + "azimuth-llm": { + "type": "object", + "properties": { + "huggingface": { + "type": "object", + "properties": { + "model": { + "type": "string", + "title": "Model", + "description": "The [HuggingFace model](https://huggingface.co/models) to deploy (see [here](https://github.com/stackhpc/azimuth-llm?tab=readme-ov-file#tested-models) for a list of tested models).", + "default": "microsoft/Phi-3.5-vision-instruct" + }, + "token": { + "type": [ + "string", + "null" + ], + "title": "Access Token", + "description": "A HuggingFace [access 
token](https://huggingface.co/docs/hub/security-tokens). Required for [gated models](https://huggingface.co/docs/hub/en/models-gated) (e.g. Llama 3)." + } + }, + "required": [ + "model" + ] + }, + "api": { + "type": "object", + "properties": { + "image": { + "type": "object", + "properties": { + "version": { + "type": "string", + "title": "Backend vLLM version", + "description": "The vLLM version to use as a backend. Must be a version tag from [this list](https://github.com/vllm-project/vllm/tags)", + "default": "v0.6.3" + } + } + }, + "modelMaxContextLength": { + "title": "Model Context Length", + "description": "An override for the maximum context length to allow, if the model's default is not suitable." + } + } + }, + "ui": { + "type": "object", + "properties": { + "appSettings": { + "type": "object", + "properties": { + "model_name": { + "type": "string", + "title": "Model Name", + "description": "Model name supplied to the OpenAI client in frontend web app. Should match huggingface.model above." + }, + "llm_params": { + "$comment": "top_k parameter causes vLLM to error for most (all?) vision models so is excluded here", + "type": "object", + "properties": { + "max_tokens": { + "type": "integer", + "title": "Max Tokens", + "description": "The maximum number of new [tokens](https://platform.openai.com/docs/api-reference/chat/create#chat-create-max_tokens) to generate for each LLM response.", + "default": 1000 + }, + "temperature": { + "type": "number", + "title": "LLM Temperature", + "description": "The [temperature](https://platform.openai.com/docs/api-reference/chat/create#chat-create-temperature) value to use when generating LLM responses.", + "default": 0, + "minimum": 0, + "maximum": 2 + }, + "top_p": { + "type": "number", + "title": "LLM Top P", + "description": "The [top p](https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p) value to use when generating LLM responses.", + "default": 1, + "exclusiveMinimum": 0, + "maximum": 1 + }, + "presence_penalty": { + "type": "number", + "title": "LLM Presence Penalty", + "description": "The [presence penalty](https://platform.openai.com/docs/api-reference/chat/create#chat-create-presence_penalty) to use when generating LLM responses.", + "default": 0, + "minimum": -2, + "maximum": 2 + }, + "frequency_penalty": { + "type": "number", + "title": "LLM Frequency Penalty", + "description": "The [frequency_penalty](https://platform.openai.com/docs/api-reference/chat/create#chat-create-frequency_penalty) to use when generating LLM responses.", + "default": 0, + "minimum": -2, + "maximum": 2 + } + } + } + }, + "required": [ + "model_name" + ] + } + } + } + } + } + } +} diff --git a/charts/azimuth-image-analysis/values.yaml b/charts/azimuth-image-analysis/values.yaml new file mode 100644 index 0000000..c5a770c --- /dev/null +++ b/charts/azimuth-image-analysis/values.yaml @@ -0,0 +1,8 @@ +azimuth-llm: + huggingface: + model: &model-name microsoft/Phi-3.5-vision-instruct + ui: + image: + repository: ghcr.io/stackhpc/azimuth-llm-image-analysis-ui + appSettings: + model_name: *model-name diff --git a/chart/.helmignore b/charts/azimuth-llm/.helmignore similarity index 100% rename from chart/.helmignore rename to charts/azimuth-llm/.helmignore diff --git a/chart/Chart.yaml b/charts/azimuth-llm/Chart.yaml similarity index 90% rename from chart/Chart.yaml rename to charts/azimuth-llm/Chart.yaml index a6542df..6c92b69 100644 --- a/chart/Chart.yaml +++ b/charts/azimuth-llm/Chart.yaml @@ -30,9 +30,3 @@ icon: 
https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg annotations: azimuth.stackhpc.com/label: HuggingFace LLM - -dependencies: - - name: reloader - version: 1.0.63 - repository: https://stakater.github.io/stakater-charts - condition: ui.enabled diff --git a/chart/ci/web-apps-only-values.yaml b/charts/azimuth-llm/ci/no-api-values.yaml similarity index 100% rename from chart/ci/web-apps-only-values.yaml rename to charts/azimuth-llm/ci/no-api-values.yaml diff --git a/chart/templates/NOTES.txt b/charts/azimuth-llm/templates/NOTES.txt similarity index 100% rename from chart/templates/NOTES.txt rename to charts/azimuth-llm/templates/NOTES.txt diff --git a/chart/templates/_helpers.tpl b/charts/azimuth-llm/templates/_helpers.tpl similarity index 100% rename from chart/templates/_helpers.tpl rename to charts/azimuth-llm/templates/_helpers.tpl diff --git a/chart/templates/api/config-map-grafana-dashboard-details.yml b/charts/azimuth-llm/templates/api/config-map-grafana-dashboard-details.yml similarity index 100% rename from chart/templates/api/config-map-grafana-dashboard-details.yml rename to charts/azimuth-llm/templates/api/config-map-grafana-dashboard-details.yml diff --git a/chart/templates/api/config-map-grafana-dashboard-summary.yml b/charts/azimuth-llm/templates/api/config-map-grafana-dashboard-summary.yml similarity index 98% rename from chart/templates/api/config-map-grafana-dashboard-summary.yml rename to charts/azimuth-llm/templates/api/config-map-grafana-dashboard-summary.yml index 9a2002e..672d862 100644 --- a/chart/templates/api/config-map-grafana-dashboard-summary.yml +++ b/charts/azimuth-llm/templates/api/config-map-grafana-dashboard-summary.yml @@ -184,7 +184,8 @@ data: "value": 80 } ] - } + }, + "unit": "locale" }, "overrides": [] }, @@ -279,7 +280,8 @@ data: "value": 80 } ] - } + }, + "unit": "locale" }, "overrides": [] }, @@ -378,7 +380,8 @@ data: "value": 80 } ] - } + }, + "unit": "locale" }, "overrides": [] }, @@ -477,7 +480,8 @@ data: "value": 80 } ] - } + }, + "unit": "locale" }, "overrides": [] }, @@ -518,7 +522,7 @@ data: "useBackend": false } ], - "title": "Tokens Generated (total)", + "title": "Generated Tokens (total)", "type": "timeseries" }, { @@ -814,14 +818,14 @@ data: "list": [] }, "time": { - "from": "now-120d", + "from": "now-90d", "to": "now" }, "timepicker": {}, "timezone": "", "title": "vLLM Dashboard - Summary", "uid": "ee0cbu8l3b400dasdasfas", - "version": 1, + "version": 5, "weekStart": "" } {{- end -}} diff --git a/chart/templates/api/deployment.yml b/charts/azimuth-llm/templates/api/deployment.yml similarity index 100% rename from chart/templates/api/deployment.yml rename to charts/azimuth-llm/templates/api/deployment.yml diff --git a/chart/templates/api/ingress.yml b/charts/azimuth-llm/templates/api/ingress.yml similarity index 100% rename from chart/templates/api/ingress.yml rename to charts/azimuth-llm/templates/api/ingress.yml diff --git a/chart/templates/api/service-monitor.yml b/charts/azimuth-llm/templates/api/service-monitor.yml similarity index 100% rename from chart/templates/api/service-monitor.yml rename to charts/azimuth-llm/templates/api/service-monitor.yml diff --git a/chart/templates/api/service.yml b/charts/azimuth-llm/templates/api/service.yml similarity index 100% rename from chart/templates/api/service.yml rename to charts/azimuth-llm/templates/api/service.yml diff --git a/chart/templates/api/zenith-client.yml b/charts/azimuth-llm/templates/api/zenith-client.yml similarity index 100% 
rename from chart/templates/api/zenith-client.yml rename to charts/azimuth-llm/templates/api/zenith-client.yml diff --git a/chart/templates/api/zenith-reservation.yml b/charts/azimuth-llm/templates/api/zenith-reservation.yml similarity index 100% rename from chart/templates/api/zenith-reservation.yml rename to charts/azimuth-llm/templates/api/zenith-reservation.yml diff --git a/chart/templates/test/end-to-end.yml b/charts/azimuth-llm/templates/test/end-to-end.yml similarity index 100% rename from chart/templates/test/end-to-end.yml rename to charts/azimuth-llm/templates/test/end-to-end.yml diff --git a/chart/templates/test/web-app.yml b/charts/azimuth-llm/templates/test/web-app.yml similarity index 100% rename from chart/templates/test/web-app.yml rename to charts/azimuth-llm/templates/test/web-app.yml diff --git a/chart/templates/ui/app-config-map.yml b/charts/azimuth-llm/templates/ui/app-config-map.yml similarity index 100% rename from chart/templates/ui/app-config-map.yml rename to charts/azimuth-llm/templates/ui/app-config-map.yml diff --git a/chart/templates/ui/deployment.yml b/charts/azimuth-llm/templates/ui/deployment.yml similarity index 81% rename from chart/templates/ui/deployment.yml rename to charts/azimuth-llm/templates/ui/deployment.yml index aa52e02..3938893 100644 --- a/chart/templates/ui/deployment.yml +++ b/charts/azimuth-llm/templates/ui/deployment.yml @@ -5,9 +5,6 @@ metadata: name: {{ .Release.Name }}-ui labels: {{- include "azimuth-llm.labels" . | nindent 4 }} - annotations: - # Make sure UI is reloaded when app settings are updated - reloader.stakater.com/auto: "true" spec: replicas: 1 selector: @@ -19,6 +16,10 @@ spec: metadata: labels: {{- include "azimuth-llm.ui-selectorLabels" . | nindent 8 }} + # Restart deployment when settings config map changes + # https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments + annotations: + checksum/config: {{ include (print $.Template.BasePath "/ui/app-config-map.yml") . 
| sha256sum }} spec: containers: - name: {{ .Release.Name }}-ui diff --git a/chart/templates/ui/ingress.yml b/charts/azimuth-llm/templates/ui/ingress.yml similarity index 100% rename from chart/templates/ui/ingress.yml rename to charts/azimuth-llm/templates/ui/ingress.yml diff --git a/chart/templates/ui/service.yml b/charts/azimuth-llm/templates/ui/service.yml similarity index 100% rename from chart/templates/ui/service.yml rename to charts/azimuth-llm/templates/ui/service.yml diff --git a/chart/templates/ui/ui-zenith-client.yml b/charts/azimuth-llm/templates/ui/ui-zenith-client.yml similarity index 100% rename from chart/templates/ui/ui-zenith-client.yml rename to charts/azimuth-llm/templates/ui/ui-zenith-client.yml diff --git a/chart/templates/ui/ui-zenith-reservation.yml b/charts/azimuth-llm/templates/ui/ui-zenith-reservation.yml similarity index 100% rename from chart/templates/ui/ui-zenith-reservation.yml rename to charts/azimuth-llm/templates/ui/ui-zenith-reservation.yml diff --git a/chart/values.yaml b/charts/azimuth-llm/values.yaml similarity index 91% rename from chart/values.yaml rename to charts/azimuth-llm/values.yaml index 12db8b3..dc3a95f 100644 --- a/chart/values.yaml +++ b/charts/azimuth-llm/values.yaml @@ -80,13 +80,18 @@ api: ui: # Toggles installation of the gradio web UI enabled: true - # The file from the UI config map to execute as the entrypoint to the frontend app - entrypoint: app.py - # The values to be written to settings.yml for parsing as frontend app setting - # (see example_app.py and config.py for example using pydantic-settings to configure app) + # Container image config + image: + repository: ghcr.io/stackhpc/azimuth-llm-chat-ui + version: ef83288 + imagePullPolicy: + # The settings to be passed to the frontend web app. + # Format depends on the chosen UI image above. For each of the UIs + # included in the web-apps/ folder of this git repository there is a + # defaults.yml file (e.g. web-apps/chat/defaults.yml) listing all + # available configuration options. appSettings: - hf_model_name: *model-name - hf_model_instruction: "You are a helpful AI assistant. Please response appropriately." + model_name: *model-name # Use local system fonts by default to avoid GDPR issues # with Gradio's defaults fonts which require fetching from # the Google fonts API. To restore default Gradio theme @@ -98,11 +103,6 @@ ui: font_mono: - sans-serif - Arial - # Container image config - image: - repository: ghcr.io/stackhpc/azimuth-llm-chat-interface - version: 87a0342 - imagePullPolicy: # Service config service: name: web-app @@ -149,5 +149,3 @@ ingress: # Annotations to apply to the ingress resource # e.g. for cert-manager integration annotations: -reloader: - watchGlobally: false diff --git a/ct.yaml b/ct.yaml index 866e08c..f5fada9 100644 --- a/ct.yaml +++ b/ct.yaml @@ -1,2 +1,8 @@ +# Complains about invalid maintainer URLs validate-maintainers: false -charts: chart/ +# Skip version bump detection and lint all charts +# since we're using the azimuth-cloud Helm chart publish +# workflow which doesn't use Chart.yaml's version key +all: true +# Split output to make it look nice in GitHub Actions tab +github-groups: true diff --git a/web-apps/build.sh b/web-apps/build.sh index 5fe3c98..0dd5d4a 100755 --- a/web-apps/build.sh +++ b/web-apps/build.sh @@ -2,14 +2,12 @@ set -e build() { - pushd $1 > /dev/null - if [[ -f Dockerfile ]]; then + if [[ -f $1/Dockerfile ]]; then echo Building $1 docker image - docker build . -t ghcr.io/stackhpc/azimuth-llm-$1 + docker build . 
-t ghcr.io/stackhpc/azimuth-llm-$1 -f $1/Dockerfile else echo No Dockerfile found for $1 fi - popd > /dev/null } # If a single app is provided as a diff --git a/web-apps/chat-interface/config.py b/web-apps/chat-interface/config.py deleted file mode 100644 index 8592884..0000000 --- a/web-apps/chat-interface/config.py +++ /dev/null @@ -1,97 +0,0 @@ -import logging -import yaml -from pydantic import Field, HttpUrl -from pydantic_settings import BaseSettings, SettingsConfigDict - -from typing import Optional, Union, List - -logging.basicConfig() -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -NAMESPACE_FILE_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" -def get_k8s_namespace(): - try: - current_k8s_namespace = open(NAMESPACE_FILE_PATH).read() - return current_k8s_namespace - except FileNotFoundError as err: - return None - -def default_backend(): - k8s_ns = get_k8s_namespace() - if k8s_ns: - return f"http://llm-backend.{k8s_ns}.svc" - else: - logger.warning('Failed to determine k8s namespace from %s - assuming non-kubernetes environment.', NAMESPACE_FILE_PATH) - - -class AppSettings(BaseSettings): - """ - Settings object for the UI example app. - """ - - # # Allow settings to be overwritten by LLM_UI_ env vars - # model_config = SettingsConfigDict(env_prefix="llm_ui_") - - # General settings - hf_model_name: str = Field( - description="The model to use when constructing the LLM Chat client. This should match the model name running on the vLLM backend", - ) - backend_url: HttpUrl = Field( - description="The address of the OpenAI compatible API server (either in-cluster or externally hosted)" - ) - page_title: str = Field(default="Large Language Model") - page_description: Optional[str] = Field(default=None) - hf_model_instruction: str = Field( - default="You are a helpful and cheerful AI assistant. Please respond appropriately." 
- ) - - # Model settings - - # For available parameters, see https://docs.vllm.ai/en/latest/dev/sampling_params.html - # which is based on https://platform.openai.com/docs/api-reference/completions/create - llm_max_tokens: int = Field(default=500) - llm_temperature: float = Field(default=0) - llm_top_p: float = Field(default=1) - llm_top_k: float = Field(default=-1) - llm_presence_penalty: float = Field(default=0, ge=-2, le=2) - llm_frequency_penalty: float = Field(default=0, ge=-2, le=2) - - # UI theming - - # Variables explicitly passed to gradio.theme.Default() - # For example: - # {"primary_hue": "red"} - theme_params: dict[str, Union[str, List[str]]] = Field(default_factory=dict) - # Overrides for theme.body_background_fill property - theme_background_colour: Optional[str] = Field(default=None) - # Provides arbitrary CSS and JS overrides to the UI, - # see https://www.gradio.app/guides/custom-CSS-and-JS - css_overrides: Optional[str] = Field(default=None) - custom_javascript: Optional[str] = Field(default=None) - - - # Method for loading settings from files - @staticmethod - def _load_yaml(file_path: str): - with open(file_path, "r") as file: - content = yaml.safe_load(file) or {} - return content - - @staticmethod - def load(): - defaults = AppSettings._load_yaml('./defaults.yml') - overrides = {} - try: - overrides = AppSettings._load_yaml('/etc/web-app/overrides.yml') - except FileNotFoundError: - pass - settings = {**defaults, **overrides} - # Sanity checks on settings - if 'backend_url' not in settings: - in_cluster_backend = default_backend() - if not in_cluster_backend: - raise Exception('Backend URL must be provided in settings when running this app outside of Kubernetes') - settings['backend_url'] = in_cluster_backend - return AppSettings(**settings) diff --git a/web-apps/chat-interface/defaults.yml b/web-apps/chat-interface/defaults.yml deleted file mode 100644 index 9520b39..0000000 --- a/web-apps/chat-interface/defaults.yml +++ /dev/null @@ -1,36 +0,0 @@ - -hf_model_name: "microsoft/Phi-3.5-mini-instruct" -hf_model_instruction: "You are a pirate" - -# UI theming tweaks -# css_overrides: | -# h1 { -# color: white; -# padding-top: 1em; -# } -# a { -# color: yellow; -# } -# theme_background_colour: "#00376c" -# theme_params: -# # primary_hue: blue -# font: -# - sans-serif -# font_mono: -# - sans-serif - -# custom_javascript: | -# function addPrivacyStatement() { -# var footer = document.querySelector('footer'); -# footer.appendChild(footer.children[1].cloneNode(deep=true)); -# var item = footer.children[2].cloneNode(); -# item.href = 'https://google.com'; -# item.textContent = 'Privacy Statement'; -# footer.appendChild(item); -# } - -# llm_max_tokens: -# llm_temperature: -# llm_top_p: -# llm_frequency_penalty: -# llm_presence_penalty: diff --git a/web-apps/chat-interface/Dockerfile b/web-apps/chat/Dockerfile similarity index 63% rename from web-apps/chat-interface/Dockerfile rename to web-apps/chat/Dockerfile index 803d58f..c963b29 100644 --- a/web-apps/chat-interface/Dockerfile +++ b/web-apps/chat/Dockerfile @@ -1,6 +1,9 @@ FROM python:3.11-slim -COPY requirements.txt requirements.txt +ARG DIR=chat + +COPY $DIR/requirements.txt requirements.txt +COPY utils utils RUN pip install --no-cache-dir -r requirements.txt COPY purge-google-fonts.sh purge-google-fonts.sh @@ -8,9 +11,8 @@ RUN bash purge-google-fonts.sh WORKDIR /app -COPY *.py . +COPY $DIR/*.py . -COPY defaults.yml . -# COPY overrides.yml . +COPY $DIR/defaults.yml . 
ENTRYPOINT ["python3", "app.py"] diff --git a/web-apps/chat-interface/app.py b/web-apps/chat/app.py similarity index 73% rename from web-apps/chat-interface/app.py rename to web-apps/chat/app.py index 3ead467..8894fef 100644 --- a/web-apps/chat-interface/app.py +++ b/web-apps/chat/app.py @@ -1,23 +1,39 @@ -import sys import logging +import openai + import gradio as gr -from urllib.parse import urljoin -from config import AppSettings +from urllib.parse import urljoin from langchain.schema import HumanMessage, AIMessage, SystemMessage from langchain_openai import ChatOpenAI -import openai +from typing import Dict, List +from pydantic import BaseModel, ConfigDict +from utils import LLMParams, load_settings logging.basicConfig() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -logger.info("Starting app") -settings = AppSettings.load() -if len(sys.argv) > 1: - settings.hf_model_name = sys.argv[1] -logger.info("App settings: %s", settings) +class AppSettings(BaseModel): + # Basic config + host_address: str + backend_url: str + model_name: str + model_instruction: str + page_title: str + llm_params: LLMParams + # Theme customisation + theme_params: Dict[str, str | list] + theme_params_extended: Dict[str, str] + css_overrides: str | None + custom_javascript: str | None + # Error on typos and suppress warnings for fields with 'model_' prefix + model_config = ConfigDict(protected_namespaces=(), extra="forbid") + + +settings = AppSettings(**load_settings()) +logger.info(settings) backend_url = str(settings.backend_url) backend_health_endpoint = urljoin(backend_url, "/health") @@ -36,26 +52,15 @@ class PossibleSystemPromptException(Exception): llm = ChatOpenAI( base_url=urljoin(backend_url, "v1"), - model=settings.hf_model_name, + model=settings.model_name, openai_api_key="required-but-not-used", - temperature=settings.llm_temperature, - max_tokens=settings.llm_max_tokens, - # model_kwargs={ - # "top_p": settings.llm_top_p, - # "frequency_penalty": settings.llm_frequency_penalty, - # "presence_penalty": settings.llm_presence_penalty, - # # Additional parameters supported by vLLM but not OpenAI API - # # https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters - # "extra_body": { - # "top_k": settings.llm_top_k, - # } - top_p=settings.llm_top_p, - frequency_penalty=settings.llm_frequency_penalty, - presence_penalty=settings.llm_presence_penalty, - # Additional parameters supported by vLLM but not OpenAI API - # https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters + temperature=settings.llm_params.temperature, + max_tokens=settings.llm_params.max_tokens, + top_p=settings.llm_params.top_p, + frequency_penalty=settings.llm_params.frequency_penalty, + presence_penalty=settings.llm_params.presence_penalty, extra_body={ - "top_k": settings.llm_top_k, + "top_k": settings.llm_params.top_k, }, streaming=True, ) @@ -67,13 +72,13 @@ def inference(latest_message, history): try: if INCLUDE_SYSTEM_PROMPT: - context = [SystemMessage(content=settings.hf_model_instruction)] + context = [SystemMessage(content=settings.model_instruction)] else: context = [] for i, (human, ai) in enumerate(history): if not INCLUDE_SYSTEM_PROMPT and i == 0: # Mimic system prompt by prepending it to first human message - human = f"{settings.hf_model_instruction}\n\n{human}" + human = f"{settings.model_instruction}\n\n{human}" context.append(HumanMessage(content=human)) context.append(AIMessage(content=(ai or ""))) 
context.append(HumanMessage(content=latest_message)) @@ -131,8 +136,7 @@ def inference(latest_message, history): # UI theming theme = gr.themes.Default(**settings.theme_params) -if settings.theme_background_colour: - theme.body_background_fill = settings.theme_background_colour +theme.set(**settings.theme_params_extended) def inference_wrapper(*args): @@ -153,7 +157,7 @@ def inference_wrapper(*args): # Build main chat interface -with gr.ChatInterface( +app = gr.ChatInterface( inference_wrapper, chatbot=gr.Chatbot( # Height of conversation window in CSS units (string) or pixels (int) @@ -167,7 +171,6 @@ def inference_wrapper(*args): scale=7, ), title=settings.page_title, - description=settings.page_description, retry_btn="Retry", undo_btn="Undo", clear_btn="Clear", @@ -175,16 +178,8 @@ def inference_wrapper(*args): theme=theme, css=settings.css_overrides, js=settings.custom_javascript, -) as app: - logger.debug("Gradio chat interface config: %s", app.config) - # For running locally in tilt dev setup - if len(sys.argv) > 2 and sys.argv[2] == "localhost": - app.launch() - # For running on cluster - else: - app.queue( - # Allow 10 concurrent requests to backend - # vLLM backend should be clever enough to - # batch these requests appropriately. - default_concurrency_limit=10, - ).launch(server_name="0.0.0.0") +) +logger.debug("Gradio chat interface config: %s", app.config) +app.queue( + default_concurrency_limit=10, +).launch(server_name=settings.host_address) diff --git a/web-apps/chat/defaults.yml b/web-apps/chat/defaults.yml new file mode 100644 index 0000000..83f0e46 --- /dev/null +++ b/web-apps/chat/defaults.yml @@ -0,0 +1,31 @@ + +model_name: +model_instruction: "You are a helpful and cheerful AI assistant. Please respond appropriately." +backend_url: +host_address: 0.0.0.0 + +page_title: Large Language Model + +# LLM request parameters +# See https://platform.openai.com/docs/api-reference/chat/create +# and https://docs.vllm.ai/en/v0.6.0/serving/openai_compatible_server.html#extra-parameters +llm_params: + max_tokens: + temperature: 0 + top_p: + top_k: + frequency_penalty: + presence_penalty: + +# Gradio theme constructor parameters (e.g. 'primary_hue') +# See https://www.gradio.app/guides/theming-guide +theme_params: {} + +# Gradio theme .set(...) 
parameters +# See https://www.gradio.app/guides/theming-guide#extending-themes-via-set +theme_params_extended: {} + +# Additional CSS and JS overrides +# See https://www.gradio.app/guides/custom-CSS-and-JS +css_overrides: +custom_javascript: diff --git a/web-apps/chat-interface/gradio-client-test.py b/web-apps/chat/gradio-client-test.py similarity index 100% rename from web-apps/chat-interface/gradio-client-test.py rename to web-apps/chat/gradio-client-test.py diff --git a/web-apps/chat-interface/requirements.txt b/web-apps/chat/requirements.txt similarity index 78% rename from web-apps/chat-interface/requirements.txt rename to web-apps/chat/requirements.txt index 3f34151..a82255b 100644 --- a/web-apps/chat-interface/requirements.txt +++ b/web-apps/chat/requirements.txt @@ -4,4 +4,4 @@ openai langchain langchain_openai pydantic -pydantic_settings +../utils diff --git a/web-apps/image-analysis/Dockerfile b/web-apps/image-analysis/Dockerfile new file mode 100644 index 0000000..5f858f8 --- /dev/null +++ b/web-apps/image-analysis/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.11-slim + +ARG DIR=image-analysis + +COPY $DIR/requirements.txt requirements.txt +COPY utils utils +RUN pip install --no-cache-dir -r requirements.txt + +COPY purge-google-fonts.sh purge-google-fonts.sh +RUN bash purge-google-fonts.sh + +WORKDIR /app + +COPY $DIR/*.py . + +COPY $DIR/defaults.yml . + +ENTRYPOINT ["python3", "app.py"] diff --git a/web-apps/image-analysis/app.py b/web-apps/image-analysis/app.py new file mode 100644 index 0000000..c60f412 --- /dev/null +++ b/web-apps/image-analysis/app.py @@ -0,0 +1,121 @@ +import base64 +import logging +import requests + +import gradio as gr + +from typing import List, Dict +from io import BytesIO +from PIL import Image +from pydantic import BaseModel, ConfigDict +from urllib.parse import urljoin + +from utils import load_settings, LLMParams + +logging.basicConfig() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class PromptExample(BaseModel): + image_url: str + prompt: str + + +class AppSettings(BaseModel): + # Basic config + host_address: str + backend_url: str + model_name: str + page_title: str + page_description: str + examples: List[PromptExample] + llm_params: LLMParams | None + # Theme customisation + theme_params: Dict[str, str | list] + theme_params_extended: Dict[str, str] + css_overrides: str | None + custom_javascript: str | None + # Error on typos and suppress warnings for fields with 'model_' prefix + model_config = ConfigDict(protected_namespaces=(), extra="forbid") + + +settings = AppSettings(**load_settings()) +logger.info(settings) + + +# TODO: Rewrite this to stream output? 
+def analyze_image(image_url, prompt): + try: + # Download the image + response = requests.get(image_url) + response.raise_for_status() + image = Image.open(BytesIO(response.content)) + + # Convert image to base64 + buffered = BytesIO() + image.save(buffered, format="PNG") + img_str = base64.b64encode(buffered.getvalue()).decode() + + # Prepare the payload for the vision model + payload = { + "model": settings.model_name, + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{img_str}"}, + }, + ], + } + ], + **{k: v for k, v in settings.llm_params if k != "top_k" and v is not None}, + } + if settings.llm_params.top_k: + payload["extra_body"] = { + "top_k": settings.llm_params.top_k, + } + + # Make the API call to the vision model + headers = {"Content-Type": "application/json"} + response = requests.post( + urljoin(settings.backend_url, "/v1/chat/completions"), + json=payload, + headers=headers, + ) + response.raise_for_status() + + # Extract and return the model's response + result = response.json() + return result["choices"][0]["message"]["content"] + + except Exception as e: + return f"An error occurred: {str(e)}" + + +# UI theming +theme = gr.themes.Default(**settings.theme_params) +theme.set(**settings.theme_params_extended) + +# Set up the Gradio interface +app = gr.Interface( + fn=analyze_image, + inputs=[ + gr.Textbox(label="Image URL"), + gr.Textbox(label="Prompt/Question", elem_id="prompt", scale=2), + ], + outputs=gr.Textbox(label="Results"), + title=settings.page_title, + description=settings.page_description, + examples=[[ex.image_url, ex.prompt] for ex in settings.examples], + theme=theme, + css=settings.css_overrides, + js=settings.custom_javascript, + analytics_enabled=False, +) + +# Launch the interface +app.queue(default_concurrency_limit=10).launch(server_name=settings.host_address) diff --git a/web-apps/image-analysis/defaults.yml b/web-apps/image-analysis/defaults.yml new file mode 100644 index 0000000..21d233c --- /dev/null +++ b/web-apps/image-analysis/defaults.yml @@ -0,0 +1,38 @@ + +model_name: +backend_url: +host_address: 0.0.0.0 + +page_title: Image analysis with a vision model +page_description: This model can be used to analyse image files. + +# Example inputs to render in the UI +examples: + - image_url: https://www.myplace.de/sites/default/files/styles/blog_hero_bild_slideshow/public/blog/Platzprofessor-MyPlace-SelfStorage-Die-Stadt-als-Raum-der-Begegnung-H.jpg?itok=ibY2Hoy9 + prompt: Conduct a detailed image analysis and describe all parts of the image that you can identify. Count all occurrences of the entities, which you can identify. Make a guess about the provenance or location of the image. + +# LLM request parameters +# See https://platform.openai.com/docs/api-reference/chat/create +# and https://docs.vllm.ai/en/v0.6.0/serving/openai_compatible_server.html#extra-parameters +llm_params: + max_tokens: + temperature: + top_p: + # vLLM rejects requests with top_k parameter for + # most (all?) vision models so can't use it here + # top_k: + frequency_penalty: + presence_penalty: + +# Gradio theme constructor parameters (e.g. 'primary_hue') +# See https://www.gradio.app/guides/theming-guide +theme_params: {} + +# Gradio theme .set(...) 
parameters +# See https://www.gradio.app/guides/theming-guide#extending-themes-via-set +theme_params_extended: {} + +# Additional CSS and JS overrides +# See https://www.gradio.app/guides/custom-CSS-and-JS +css_overrides: +custom_javascript: diff --git a/web-apps/image-analysis/requirements.txt b/web-apps/image-analysis/requirements.txt new file mode 100644 index 0000000..006c6a9 --- /dev/null +++ b/web-apps/image-analysis/requirements.txt @@ -0,0 +1,6 @@ +pillow +requests +gradio<5 +gradio_client +pydantic +../utils diff --git a/web-apps/chat-interface/purge-google-fonts.sh b/web-apps/purge-google-fonts.sh similarity index 100% rename from web-apps/chat-interface/purge-google-fonts.sh rename to web-apps/purge-google-fonts.sh diff --git a/web-apps/run.sh b/web-apps/run.sh index 5baa0c6..877bf5d 100755 --- a/web-apps/run.sh +++ b/web-apps/run.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -IMAGE_TAG=azimuth-llm-$1 +IMAGE_TAG=ghcr.io/stackhpc/azimuth-llm-$1 error() { echo $1 @@ -18,4 +18,4 @@ else echo "Found local $IMAGE_TAG docker image" fi -docker run -p 7860:7860 $IMAGE_TAG +docker run --rm -v ./$1/overrides.yml:/etc/web-app/overrides.yml -p 7860:7860 $IMAGE_TAG diff --git a/web-apps/utils/setup.py b/web-apps/utils/setup.py new file mode 100644 index 0000000..515d709 --- /dev/null +++ b/web-apps/utils/setup.py @@ -0,0 +1,8 @@ +from setuptools import setup + +setup( + name='web-app-utils', + version='0.0.1', + py_modules=["utils"], + install_requires=["pydantic", "PyYAML"] +) diff --git a/web-apps/utils/utils.py b/web-apps/utils/utils.py new file mode 100644 index 0000000..de299f8 --- /dev/null +++ b/web-apps/utils/utils.py @@ -0,0 +1,93 @@ +##### +# Shared utility functions and models for re-use by multiple web apps +##### + +import logging +import pathlib +import yaml +from typing import Annotated +from pydantic import BaseModel, ConfigDict, PositiveInt, Field + +logging.basicConfig() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class LLMParams(BaseModel): + """ + Parameters for vLLM API requests. 
For details see + https://platform.openai.com/docs/api-reference/chat/create + https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#extra-parameters + """ + + max_tokens: PositiveInt | None = None + temperature: Annotated[float, Field(ge=0, le=2)] | None = None + top_p: Annotated[float, Field(gt=0, le=1)] | None = None + top_k: Annotated[int, Field(ge=-1)] | None = None + frequency_penalty: Annotated[float, Field(ge=-2, le=2)] | None = None + presence_penalty: Annotated[float, Field(ge=-2, le=2)] | None = None + # Make sure we can't smuggle in extra request params / typos + model_config = ConfigDict(extra="forbid") + + +NAMESPACE_FILE_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + + +def get_k8s_namespace(): + try: + current_k8s_namespace = open(NAMESPACE_FILE_PATH).read() + return current_k8s_namespace + except FileNotFoundError: + return None + + +def api_address_in_cluster(): + k8s_ns = get_k8s_namespace() + if k8s_ns: + return f"http://llm-backend.{k8s_ns}.svc" + else: + logger.warning( + "Failed to determine k8s namespace from %s - assuming non-kubernetes environment.", + NAMESPACE_FILE_PATH, + ) + + +# Method for loading settings from files +def load_yaml(file_path: str) -> dict: + with open(file_path, "r") as file: + content = yaml.safe_load(file) or {} + return content + + +def load_settings() -> dict: + + defaults = load_yaml("./defaults.yml") + overrides = {} + # Path must match the one used in the Helm chart's + # app-config-map.yml template + path = pathlib.Path("/etc/web-app/overrides.yml") + if path.exists(): + overrides = load_yaml(path) + else: + # Allow local overrides for dev/testing + path = pathlib.Path("./overrides.yml") + if path.exists(): + overrides = load_yaml(path) + + # Sanity checks on settings + unused_overrides = [k for k in overrides.keys() if k not in defaults.keys()] + if unused_overrides: + logger.warning( + f"Overrides {unused_overrides} not part of default settings so may be ignored. " + "Please check for typos." + ) + settings = {**defaults, **overrides} + if "backend_url" not in settings or not settings["backend_url"]: + # Try to detect in-cluster address + in_cluster_backend = api_address_in_cluster() + if not in_cluster_backend: + raise Exception( + "Backend URL must be provided in settings when running outside of Kubernetes." + ) + settings["backend_url"] = in_cluster_backend + return settings
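To make the settings flow above concrete, here is a short usage sketch of the shared helpers in `web-apps/utils/utils.py`, as consumed by the web apps. It assumes it is run from one of the web-app directories (so `./defaults.yml` exists) and that an `overrides.yml` supplies `backend_url` when running outside of Kubernetes:

```python
# Illustrative sketch (not part of this diff) of the load_settings/LLMParams
# contract defined in web-apps/utils/utils.py. Assumes ./defaults.yml exists
# and that overrides.yml provides backend_url when not running in a k8s pod.
from pydantic import ValidationError
from utils import LLMParams, load_settings

# Merges ./defaults.yml with /etc/web-app/overrides.yml (or a local
# ./overrides.yml for dev); unknown override keys are logged as likely typos.
settings = load_settings()

# llm_params entries left blank in defaults.yml arrive as None, which the
# optional fields on LLMParams accept.
params = LLMParams(**(settings.get("llm_params") or {}))
print(params.temperature)

try:
    LLMParams(temprature=0.5)  # misspelled field name
except ValidationError as err:
    print("rejected by extra='forbid':", err)

try:
    LLMParams(temperature=3.0)  # outside the ge=0, le=2 constraint
except ValidationError as err:
    print("rejected by field constraints:", err)
```

The `extra="forbid"` model config is what turns a misspelled request parameter into a hard validation error instead of a silently ignored setting.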