diff --git a/config/manifests/inferencepool-resources.yaml b/config/manifests/inferencepool-resources.yaml deleted file mode 100644 index ffe19654b..000000000 --- a/config/manifests/inferencepool-resources.yaml +++ /dev/null @@ -1,186 +0,0 @@ -# Note: If you change this file, please also change: -# - ./test/testdata/inferencepool-e2e.yaml -# - ./conformance/resources/manifests/manifests.yaml -# - ./site-src/guides/inferencepool-rollout.md ---- -apiVersion: inference.networking.k8s.io/v1 -kind: InferencePool -metadata: - name: vllm-llama3-8b-instruct -spec: - targetPorts: - - number: 8000 - selector: - matchLabels: - app: vllm-llama3-8b-instruct - endpointPickerRef: - name: vllm-llama3-8b-instruct-epp - kind: Service - port: - number: 9002 ---- -apiVersion: v1 -kind: Service -metadata: - name: vllm-llama3-8b-instruct-epp - namespace: default -spec: - selector: - app: vllm-llama3-8b-instruct-epp - ports: - - protocol: TCP - port: 9002 - targetPort: 9002 - appProtocol: http2 - type: ClusterIP ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: vllm-llama3-8b-instruct-epp - namespace: default ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm-llama3-8b-instruct-epp - namespace: default - labels: - app: vllm-llama3-8b-instruct-epp -spec: - replicas: 1 - selector: - matchLabels: - app: vllm-llama3-8b-instruct-epp - template: - metadata: - labels: - app: vllm-llama3-8b-instruct-epp - spec: - serviceAccountName: vllm-llama3-8b-instruct-epp - # Conservatively, this timeout should mirror the longest grace period of the pods within the pool - terminationGracePeriodSeconds: 130 - containers: - - name: epp - image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main - imagePullPolicy: Always - args: - - --pool-name - - "vllm-llama3-8b-instruct" - - "--pool-namespace" - - "default" - - --v - - "4" - - --zap-encoder - - "json" - - --grpc-port - - "9002" - - --grpc-health-port - - "9003" - - "--config-file" - - 
"/config/default-plugins.yaml" - ports: - - containerPort: 9002 - - containerPort: 9003 - - name: metrics - containerPort: 9090 - livenessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 - readinessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 - volumeMounts: - - name: plugins-config-volume - mountPath: "/config" - volumes: - - name: plugins-config-volume - configMap: - name: plugins-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: plugins-config - namespace: default -data: - default-plugins.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: queue-scorer - - type: kv-cache-utilization-scorer - - type: prefix-cache-scorer - schedulingProfiles: - - name: default - plugins: - - pluginRef: queue-scorer - - pluginRef: kv-cache-utilization-scorer - - pluginRef: prefix-cache-scorer ---- -kind: Role -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-read - namespace: default -rules: -- apiGroups: [ "inference.networking.x-k8s.io" ] - resources: [ "inferenceobjectives", "inferencepools" ] - verbs: [ "get", "watch", "list" ] -- apiGroups: [ "inference.networking.k8s.io" ] - resources: [ "inferencepools" ] - verbs: [ "get", "watch", "list" ] -- apiGroups: [ "" ] - resources: [ "pods" ] - verbs: [ "get", "watch", "list" ] ---- -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-read-binding - namespace: default -subjects: -- kind: ServiceAccount - name: vllm-llama3-8b-instruct-epp - namespace: default -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: pod-read ---- -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: auth-reviewer -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews 
- verbs: - - create ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: auth-reviewer-binding -subjects: -- kind: ServiceAccount - name: vllm-llama3-8b-instruct-epp - namespace: default -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: auth-reviewer diff --git a/hack/release-quickstart.sh b/hack/release-quickstart.sh index 22c705184..04b79a3ef 100755 --- a/hack/release-quickstart.sh +++ b/hack/release-quickstart.sh @@ -74,25 +74,21 @@ sed -i.bak "s|kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-in # ----------------------------------------------------------------------------- # Update image references # ----------------------------------------------------------------------------- -EPP="config/manifests/inferencepool-resources.yaml" #TODO: Put all helm values files into an array to loop over EPP_HELM="config/charts/inferencepool/values.yaml" BBR_HELM="config/charts/body-based-routing/values.yaml" CONFORMANCE_MANIFESTS="conformance/resources/base.yaml" -echo "Updating ${EPP}, ${EPP_HELM}, ${BBR_HELM}, and ${CONFORMANCE_MANIFESTS} ..." +echo "Updating ${EPP_HELM}, ${BBR_HELM}, and ${CONFORMANCE_MANIFESTS} ..." # Update the container tag. -sed -i.bak -E "s|(us-central1-docker\.pkg\.dev/k8s-staging-images/gateway-api-inference-extension/epp:)[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EPP" sed -i.bak -E "s|(tag: )[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EPP_HELM" sed -i.bak -E "s|(tag: )[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$BBR_HELM" sed -i.bak -E "s|(us-central1-docker\.pkg\.dev/k8s-staging-images/gateway-api-inference-extension/epp:)[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$CONFORMANCE_MANIFESTS" # Update the container image pull policy. 
-sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inference-extension\/epp/{n;s/Always/IfNotPresent/;}' "$EPP" sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inference-extension\/epp/{n;s/Always/IfNotPresent/;}' "$CONFORMANCE_MANIFESTS" # Update the container registry. -sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EPP" sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EPP_HELM" sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$BBR_HELM" sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$CONFORMANCE_MANIFESTS" @@ -139,8 +135,8 @@ sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io # ----------------------------------------------------------------------------- # Stage the changes # ----------------------------------------------------------------------------- -echo "Staging $VERSION_FILE $UPDATED_CRD $README $EPP $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY files..." -git add $VERSION_FILE $UPDATED_CRD $README $EPP $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY +echo "Staging $VERSION_FILE $UPDATED_CRD $README $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY files..." 
+git add $VERSION_FILE $UPDATED_CRD $README $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY # ----------------------------------------------------------------------------- # Cleanup backup files and finish diff --git a/site-src/implementations/model-servers.md b/site-src/implementations/model-servers.md index da9968fad..ed57e1252 100644 --- a/site-src/implementations/model-servers.md +++ b/site-src/implementations/model-servers.md @@ -19,34 +19,29 @@ vLLM is configured as the default in the [endpoint picker extension](https://git Triton specific metric names need to be specified when starting the EPP. -### Option 1: Use Helm +Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install the `inferencepool` via helm. See the [`inferencepool` helm guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/charts/inferencepool/README.md) for more details. -Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install the [`inferencepool` via helm](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool). See the [`inferencepool` helm guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool/README.md) for more details. 
+ Add the following entries to the `flags` in the Helm chart; they are passed as [flags to the EPP](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/29ea29028496a638b162ff287c62c0087211bbe5/config/charts/inferencepool/values.yaml#L36): -### Option 2: Edit EPP deployment yaml - - Add the following to the `args` of the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/manifests/inferencepool-resources.yaml#L32) - - ``` -- --total-queued-requests-metric -- "nv_trt_llm_request_metrics{request_type=waiting}" -- --kv-cache-usage-percentage-metric -- "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}" -- --lora-info-metric -- "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet. +``` +- name: total-queued-requests-metric + value: "nv_trt_llm_request_metrics{request_type=waiting}" +- name: kv-cache-usage-percentage-metric + value: "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}" +- name: lora-info-metric + value: "" # Set an empty metric to disable LoRA metric scraping, as it is not supported by Triton yet. ``` ## SGLang -### Edit EPP deployment yaml + Add the following `flags` when deploying with the Helm chart (see the chart's [values.yaml](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/29ea29028496a638b162ff287c62c0087211bbe5/config/charts/inferencepool/values.yaml#L36)): - Add the following to the `args` of the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/manifests/inferencepool-resources.yaml#L32) ``` -- --totalQueuedRequestsMetric -- "sglang:num_queue_reqs" -- --kvCacheUsagePercentageMetric -- "sglang:token_usage" -- --lora-info-metric -- "" # Set an empty metric to disable LoRA metric scraping as they are not supported by SGLang yet. 
-``` +- name: total-queued-requests-metric + value: "sglang:num_queue_reqs" +- name: kv-cache-usage-percentage-metric + value: "sglang:token_usage" +- name: lora-info-metric + value: "" # Set an empty metric to disable LoRA metric scraping, as it is not supported by SGLang yet. +``` \ No newline at end of file