diff --git a/.github/workflows/update-dependencies.yml b/.github/workflows/update-dependencies.yml
index 8c3bd83..b574b06 100644
--- a/.github/workflows/update-dependencies.yml
+++ b/.github/workflows/update-dependencies.yml
@@ -24,22 +24,26 @@ jobs:
           set -xe

           # Install dependency
-          apt update && apt install -y jq yq
+          sudo apt update
+          sudo apt install -y jq
+          sudo snap install yq

           # Tell git who we are for commits
           git config user.email "${{ github.actor }}"
           git config user.name "${{ github.actor }}"

           # Get latest vLLM release tag and replace it in various places
-          OLD_VLLM_TAG=$(yq '.api.image.version' chart/values.yml)
-          NEW_VLLM_TAG=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq .tag_name | sed s/\"//g)
+          CHART_VALUES=chart/values.yaml
+          # Export vars so that they can be used by yq's strenv function
+          export OLD_VLLM_TAG=$(yq '.api.image.version' $CHART_VALUES)
+          export NEW_VLLM_TAG=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq .tag_name | sed s/\"//g)

           if [[ $OLD_VLLM_TAG != $NEW_VLLM_TAG ]]; then
             # Set new release tag output
             echo new_vllm_tag=$NEW_VLLM_TAG >> $GITHUB_OUTPUT
             # Update yaml in-place with yq
-            yq e -i '.api.image.version = strenv(NEW_VLLM_TAG)' chart/values.yaml
+            yq e -i '.api.image.version = strenv(NEW_VLLM_TAG)' $CHART_VALUES
             # Can't use in-place editing with jq
-            jq --arg tag $NEW_VLLM_TAG '.properties.api.properties.image.properties.version.default = $tag' chart/values.schema.json.new
+            jq --indent 4 --arg tag $NEW_VLLM_TAG '.properties.api.properties.image.properties.version.default = $tag' chart/values.schema.json > chart/values.schema.json.new
             mv chart/values.schema.json{.new,}
           fi
@@ -49,6 +53,6 @@ jobs:
         with:
           base: main
           branch: update/vllm-${{ steps.dependency_updates.outputs.new_vllm_tag }}
-          title: "Update dependencies"
+          title: "Update vLLM to ${{ steps.dependency_updates.outputs.new_vllm_tag }}"
           body: This PR was automatically generated by GitHub Actions.
           delete-branch: true
diff --git a/chart/values.schema.json b/chart/values.schema.json
index b227644..fd125bd 100644
--- a/chart/values.schema.json
+++ b/chart/values.schema.json
@@ -12,12 +12,17 @@
                     "default": "microsoft/Phi-3.5-mini-instruct"
                 },
                 "token": {
-                    "type": ["string", "null"],
+                    "type": [
+                        "string",
+                        "null"
+                    ],
                     "title": "Access Token",
                     "description": "A HuggingFace [access token](https://huggingface.co/docs/hub/security-tokens). Required for [gated models](https://huggingface.co/docs/hub/en/models-gated) (e.g. Llama 3)."
                 }
             },
-            "required": ["model"]
+            "required": [
+                "model"
+            ]
         },
         "ui": {
             "type": "object",
@@ -87,9 +92,11 @@
                             "minimum": -2,
                             "maximum": 2
                         }
-                    },
-                    "required": ["hf_model_name", "hf_model_instruction"]
+                    },
+                    "required": [
+                        "hf_model_name",
+                        "hf_model_instruction"
+                    ]
                 }
             }
         },
@@ -107,7 +114,7 @@
                             "type": "string",
                             "title": "Backend vLLM version",
                             "description": "The vLLM version to use as a backend. Must be a version tag from [this list](https://github.com/vllm-project/vllm/tags)",
-                            "default": "v0.5.4"
+                            "default": "v0.5.5"
                         }
                     }
                 }
diff --git a/chart/values.yaml b/chart/values.yaml
index 626b265..a41ede2 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -15,7 +15,6 @@ huggingface:
   # repo files yet. This chart value provides a hook to manually apply the
   # correct chat template for such models.
   chatTemplate:
-
   # For private/gated huggingface models (e.g. Meta's Llama models)
   # you must provide your own huggingface token, for details see:
   # https://huggingface.co/docs/hub/security-tokens
@@ -29,7 +28,6 @@ huggingface:
   # OR FOR TESTING PURPOSES ONLY, you can instead provide the secret directly
   # as a chart value here (if secretName is set above then it will take priority)
   token:
-
 # Configuration for the backend model serving API
 api:
   # Container image config
@@ -51,13 +49,11 @@ api:
     iconUrl: https://raw.githubusercontent.com/vllm-project/vllm/v0.2.7/docs/source/assets/logos/vllm-logo-only-light.png
     description: |
       The raw inference API endpoints for the deployed LLM.
-
   # Config for huggingface model cache volume
   # This is mounted at /root/.cache/huggingface in the api deployment
   cacheVolume:
     hostPath:
       path: /tmp/llm/huggingface-cache
-
   # Number of gpus to requests for each api pod instance
   # NOTE: This must be in the range 1 <= value <= N, where
   # 'N' is the number of GPUs available in a single
@@ -73,15 +69,12 @@ api:
   # to preform a rolling zero-downtime update
   updateStrategy:
     type: Recreate
-
   # The value of the vLLM backend's max_model_len argument (if the model's default is not suitable)
   # https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
   modelMaxContextLength:
-
   # Extra args to supply to the vLLM backend, see
   # https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
   extraArgs: []
-
 # Configuration for the frontend web interface
 ui:
   # Toggles installation of the gradio web UI
@@ -124,7 +117,6 @@ ui:
     rollingUpdate:
       maxSurge: 25%
       maxUnavailable: 25%
-
 # Settings for configuring ingress resources
 # to make the UI and/or backend API accessible
 # outside the cluster.
@@ -155,6 +147,5 @@ ingress:
   # Annotations to apply to the ingress resource
   # e.g. for cert-manager integration
   annotations:
-
 reloader:
   watchGlobally: false