Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 0 additions & 186 deletions config/manifests/inferencepool-resources.yaml

This file was deleted.

10 changes: 3 additions & 7 deletions hack/release-quickstart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,25 +74,21 @@ sed -i.bak "s|kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-in
# -----------------------------------------------------------------------------
# Update image references
# -----------------------------------------------------------------------------
EPP="config/manifests/inferencepool-resources.yaml"
#TODO: Put all helm values files into an array to loop over
EPP_HELM="config/charts/inferencepool/values.yaml"
BBR_HELM="config/charts/body-based-routing/values.yaml"
CONFORMANCE_MANIFESTS="conformance/resources/base.yaml"
echo "Updating ${EPP}, ${EPP_HELM}, ${BBR_HELM}, and ${CONFORMANCE_MANIFESTS} ..."
echo "Updating ${EPP_HELM}, ${BBR_HELM}, and ${CONFORMANCE_MANIFESTS} ..."

# Update the container tag.
sed -i.bak -E "s|(us-central1-docker\.pkg\.dev/k8s-staging-images/gateway-api-inference-extension/epp:)[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EPP"
sed -i.bak -E "s|(tag: )[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EPP_HELM"
sed -i.bak -E "s|(tag: )[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$BBR_HELM"
sed -i.bak -E "s|(us-central1-docker\.pkg\.dev/k8s-staging-images/gateway-api-inference-extension/epp:)[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$CONFORMANCE_MANIFESTS"

# Update the container image pull policy.
sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inference-extension\/epp/{n;s/Always/IfNotPresent/;}' "$EPP"
sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inference-extension\/epp/{n;s/Always/IfNotPresent/;}' "$CONFORMANCE_MANIFESTS"

# Update the container registry.
sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EPP"
sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EPP_HELM"
sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$BBR_HELM"
sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$CONFORMANCE_MANIFESTS"
Expand Down Expand Up @@ -139,8 +135,8 @@ sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io
# -----------------------------------------------------------------------------
# Stage the changes
# -----------------------------------------------------------------------------
echo "Staging $VERSION_FILE $UPDATED_CRD $README $EPP $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY files..."
git add $VERSION_FILE $UPDATED_CRD $README $EPP $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY
echo "Staging $VERSION_FILE $UPDATED_CRD $README $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY files..."
git add $VERSION_FILE $UPDATED_CRD $README $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY

# -----------------------------------------------------------------------------
# Cleanup backup files and finish
Expand Down
39 changes: 17 additions & 22 deletions site-src/implementations/model-servers.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,34 +19,29 @@ vLLM is configured as the default in the [endpoint picker extension](https://git

Triton specific metric names need to be specified when starting the EPP.

### Option 1: Use Helm
Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install the `inferencepool` via helm. See the [`inferencepool` helm guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/charts/inferencepool/README.md) for more details.

Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install the [`inferencepool` via helm](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool). See the [`inferencepool` helm guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool/README.md) for more details.
Add the following to the `flags` in the helm chart as [flags to EPP](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/29ea29028496a638b162ff287c62c0087211bbe5/config/charts/inferencepool/values.yaml#L36)

### Option 2: Edit EPP deployment yaml

Add the following to the `args` of the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/manifests/inferencepool-resources.yaml#L32)

```
- --total-queued-requests-metric
- "nv_trt_llm_request_metrics{request_type=waiting}"
- --kv-cache-usage-percentage-metric
- "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
- --lora-info-metric
- "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet.
```
- name=total-queued-requests-metric
value="nv_trt_llm_request_metrics{request_type=waiting}"
- name=kv-cache-usage-percentage-metric
value="nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
- name=lora-info-metric
value="" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet.
```

## SGLang

### Edit EPP deployment yaml
Add the following `flags` while deploying using helm charts in the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/29ea29028496a638b162ff287c62c0087211bbe5/config/charts/inferencepool/values.yaml#L36)

Add the following to the `args` of the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/manifests/inferencepool-resources.yaml#L32)

```
- --totalQueuedRequestsMetric
- "sglang:num_queue_reqs"
- --kvCacheUsagePercentageMetric
- "sglang:token_usage"
- --lora-info-metric
- "" # Set an empty metric to disable LoRA metric scraping as they are not supported by SGLang yet.
```
- name=total-queued-requests-metric
value="sglang:num_queue_reqs"
- name=kv-cache-usage-percentage-metric
value="sglang:token_usage"
- name=lora-info-metric
value="" # Set an empty metric to disable LoRA metric scraping as they are not supported by SGLang yet.
```