diff --git a/.github/workflows/build-push-vllm-xpu.yml b/.github/workflows/build-push-vllm-xpu.yml
new file mode 100644
index 0000000..5723333
--- /dev/null
+++ b/.github/workflows/build-push-vllm-xpu.yml
@@ -0,0 +1,43 @@
+name: Publish vLLM XPU images
+
+on:
+  # NOTE(sd109): Since this is checking out an external repository,
+  # it's probably safer to leave this as workflow dispatch
+  # only so that we can manually build images from specific
+  # refs rather than automatically pulling in the latest
+  # content from the remote repo.
+  workflow_dispatch:
+    inputs:
+      vllm_ref:
+        type: string
+        description: The vLLM GitHub ref (tag, branch or commit) to build.
+        required: true
+
+jobs:
+  build_push_xpu_image:
+    name: Build and push image
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      id-token: write # needed for signing the images with GitHub OIDC Token
+      packages: write # required for pushing container images
+      security-events: write # required for pushing SARIF files
+    steps:
+      - name: Check out the vLLM repository
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          ref: ${{ inputs.vllm_ref }}
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push image
+        run: |
+          IMAGE=ghcr.io/stackhpc/vllm-xpu:${{ inputs.vllm_ref }}
+          docker build -f docker/Dockerfile.xpu -t $IMAGE --shm-size=4g .
+          docker push $IMAGE
diff --git a/charts/azimuth-llm/templates/api/deployment.yml b/charts/azimuth-llm/templates/api/deployment.yml
index 850b0f1..0e6206d 100644
--- a/charts/azimuth-llm/templates/api/deployment.yml
+++ b/charts/azimuth-llm/templates/api/deployment.yml
@@ -19,8 +19,13 @@ spec:
     spec:
       containers:
         - name: {{ .Release.Name }}-api
-          {{ $imageRepo := .Values.api.image.repository | default (ternary "ghcr.io/stackhpc/vllm-cpu" "vllm/vllm-openai" (eq (.Values.api.gpus | int) 0)) -}}
-          image: {{ printf "%s:%s" $imageRepo .Values.api.image.version }}
+          {{- if eq (.Values.api.gpus | int) 0 }}
+          image: "ghcr.io/stackhpc/vllm-cpu:{{ .Values.api.image.version }}"
+          {{- else if .Values.api.intelXPUsEnabled }}
+          image: "ghcr.io/stackhpc/vllm-xpu:{{ .Values.api.image.version }}"
+          {{- else }}
+          image: "vllm/vllm-openai:{{ .Values.api.image.version }}"
+          {{- end }}
           ports:
             - name: api
               containerPort: 8000
@@ -61,7 +66,11 @@ spec:
            periodSeconds: 10
          resources:
            limits:
+             {{- if .Values.api.intelXPUsEnabled }}
+             gpu.intel.com/i915: {{ .Values.api.gpus | int }}
+             {{- else }}
              nvidia.com/gpu: {{ .Values.api.gpus | int }}
+             {{- end }}
      volumes:
        - name: data
          {{- .Values.api.cacheVolume | toYaml | nindent 10 }}
diff --git a/charts/azimuth-llm/values.yaml b/charts/azimuth-llm/values.yaml
index a82b567..e42a316 100644
--- a/charts/azimuth-llm/values.yaml
+++ b/charts/azimuth-llm/values.yaml
@@ -33,8 +33,9 @@ api:
   enabled: true
   # Container image config
   image:
-    # Defaults to vllm/vllm-openai when api.gpus > 0
-    # or ghrc.io/stackhpc/vllm-cpu when api.gpus == 0
+    # Defaults to vllm/vllm-openai when api.gpus > 0,
+    # ghcr.io/stackhpc/vllm-xpu when api.gpus > 0 and intelXPUsEnabled is true,
+    # or ghcr.io/stackhpc/vllm-cpu when api.gpus == 0
     repository:
     version: v0.8.5.post1
   monitoring:
@@ -80,6 +81,8 @@ api:
   # distributed / multi-GPU support should be available, though it
   # has not been tested against this app.
   gpus: 1
+  # Whether pods should request Intel GPUs as opposed to the default Nvidia GPUs
+  intelXPUsEnabled: false
   # The update strategy to use for the deployment
   # See https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#updating-a-deployment
   # NOTE: The following RollingUpdate strategy offers a zero-downtime update but requires additional GPU worker nodes.
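Usage sketch (not part of the change itself): once the new workflow has published an XPU image, the chart can be pointed at Intel GPUs via the new value. The release name, namespace and chart path below are illustrative assumptions rather than anything defined in this diff, and api.image.version must match the tag pushed by the workflow.

    # Publish ghcr.io/stackhpc/vllm-xpu:<vllm_ref> (manual dispatch only, per the NOTE in the workflow)
    gh workflow run build-push-vllm-xpu.yml -f vllm_ref=v0.8.5.post1

    # Deploy the chart against Intel GPUs instead of the default Nvidia GPUs
    # (release name, namespace and chart path are assumed for illustration)
    helm upgrade --install llm ./charts/azimuth-llm \
      --namespace llm --create-namespace \
      --set api.gpus=1 \
      --set api.intelXPUsEnabled=true \
      --set api.image.version=v0.8.5.post1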