23 changes: 23 additions & 0 deletions config/charts/benchmark/.helmignore
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
9 changes: 9 additions & 0 deletions config/charts/benchmark/Chart.yaml
@@ -0,0 +1,9 @@
apiVersion: v2
name: benchmark
description: A Helm chart for running the benchmark tool

type: application

version: 0.0.0

appVersion: "0.0.0"
51 changes: 51 additions & 0 deletions config/charts/benchmark/README.md
@@ -0,0 +1,51 @@
# Benchmark

A chart to deploy the benchmark tool on top of a vLLM model server deployment created via the [getting started guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/#getting-started-with-gateway-api-inference-extension).


## Install

To install the benchmark tool, run:

```txt
$ helm install benchmark-tool ./config/charts/benchmark \
  --set modelServingEndpoint.mode=gateway \
  --set modelServingEndpoint.name=inference-gateway \
  --set modelServingEndpoint.namespace=default
```

## Uninstall

Run the following command to uninstall the chart:

```txt
$ helm uninstall benchmark-tool
```

## Configuration

The following table lists the configurable parameters of the chart. An example install command that overrides some of these parameters is shown after the table.

| **Parameter Name** | **Description** |
|---------------------------------------------|----------------------------------------------------------------------------------------------------|
| `benchmark.requestRates`                     | Comma-separated list of request rates (requests per second). A benchmark run is performed against the vLLM deployment for each request rate. |
| `benchmark.timeSeconds`                      | Duration of each benchmark run in seconds. The number of prompts is calculated as `requestRate * timeSeconds` for each request rate. |
| `benchmark.maxNumPrompts`                    | Maximum number of prompts to process. Only used when `requestRates` is not set. |
| `benchmark.tokenizer`                        | Name or path of the tokenizer. |
| `benchmark.models`                           | Comma-separated list of models to benchmark. |
| `benchmark.backend`                          | Model serving backend. Default: `vllm`. |
| `benchmark.port`                             | Port of the model serving backend server. |
| `benchmark.inputLength`                      | Maximum number of input tokens used to filter the benchmark dataset. |
| `benchmark.outputLength`                     | Maximum number of output tokens used to filter the benchmark dataset. |
| `benchmark.filePrefix`                       | Prefix for the benchmark result output file. |
| `benchmark.trafficSplit`                     | Comma-separated list of traffic split proportions for the models, e.g. `0.9,0.1`. The sum must equal 1.0. |
| `benchmark.scrapeServerMetrics`              | Whether to scrape server metrics. |
| `benchmark.saveAggregatedResult`             | Whether to aggregate the results of all models and save the aggregated result. |
| `benchmark.streamRequest`                    | Whether to stream the request. Required for the TTFT metric. |
| `modelServingEndpoint.mode`                  | How the LPG tool should consume the model serving endpoint for benchmarking. Options: `gateway` or `service`. |
| `modelServingEndpoint.name`                  | Name of the model serving endpoint resource, i.e. the name of the inference Gateway or the LoadBalancer Service. |
| `modelServingEndpoint.namespace`             | Namespace of the model serving endpoint resource, i.e. the namespace of the inference Gateway or the LoadBalancer Service. |
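
For example, to install the chart with a few of these parameters overridden on the command line (the values below are placeholders, adjust them to your setup; note that commas inside a `--set` value must be escaped):

```txt
$ helm install benchmark-tool ./config/charts/benchmark \
  --set modelServingEndpoint.mode=gateway \
  --set modelServingEndpoint.name=inference-gateway \
  --set modelServingEndpoint.namespace=default \
  --set benchmark.requestRates="5\,10\,20" \
  --set benchmark.timeSeconds=120 \
  --set benchmark.models="meta-llama/Llama-3.1-8B-Instruct"
```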
92 changes: 92 additions & 0 deletions config/charts/benchmark/templates/deployment.yaml
@@ -0,0 +1,92 @@
{{- $targetIP := "" -}}
{{- if eq .Values.modelServingEndpoint.mode "gateway" -}}
{{- $gw := lookup "gateway.networking.k8s.io/v1" "Gateway" .Values.modelServingEndpoint.namespace .Values.modelServingEndpoint.name -}}
{{- if not $gw }}
{{- fail (printf "Gateway %s not found in namespace %s. Please create it before installing this chart." .Values.modelServingEndpoint.name .Values.modelServingEndpoint.namespace) -}}
{{- end }}
{{- if or (not $gw.status) (not $gw.status.addresses) -}}
{{- fail (printf "Gateway %s found, but .status.addresses is not populated yet. Please wait until an IP is assigned." .Values.modelServingEndpoint.name) -}}
{{- end }}
{{- $targetIP = (index $gw.status.addresses 0).value | quote -}}
{{- end }}
{{- if eq .Values.modelServingEndpoint.mode "service" -}}
{{- $svc := lookup "v1" "Service" .Values.modelServingEndpoint.namespace .Values.modelServingEndpoint.name -}}
{{- if not $svc }}
{{- fail (printf "Service %s not found in namespace %s. Please create it before installing this chart." .Values.modelServingEndpoint.name .Values.modelServingEndpoint.namespace) -}}
{{- end }}
{{- if or (not $svc.status) (not $svc.status.loadBalancer) -}}
{{- fail (printf "Service %s found, but .status.loadBalancer is not populated yet. Please wait until an IP is assigned." .Values.modelServingEndpoint.name) -}}
{{- end }}
{{- $targetIP = (index $svc.status.loadBalancer.ingress 0).ip | quote -}}
{{- end }}

apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: {{ .Release.Name }}
  name: {{ .Release.Name }}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: {{ .Release.Name }}
  template:
    metadata:
      labels:
        app: {{ .Release.Name }}
    spec:
      containers:
        # The following image was built from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/07628c9fe01b748f5a4cc9e5c2ee4234aaf47699
        - image: 'us-docker.pkg.dev/cloud-tpu-images/inference/inference-benchmark@sha256:1c100b0cc949c7df7a2db814ae349c790f034b4b373aaad145e77e815e838438'
          imagePullPolicy: Always
          name: {{ .Release.Name }}
          command:
            - bash
            - -c
            - ./latency_throughput_curve.sh
          env:
            - name: IP
              value: {{ $targetIP }}
            - name: REQUEST_RATES
              value: {{ .Values.benchmark.requestRates | quote }}
            - name: BENCHMARK_TIME_SECONDS
              value: {{ .Values.benchmark.timeSeconds | quote }}
            - name: MAX_NUM_PROMPTS
              value: {{ .Values.benchmark.maxNumPrompts | quote }}
            - name: TOKENIZER
              value: {{ .Values.benchmark.tokenizer | quote }}
            - name: MODELS
              value: {{ .Values.benchmark.models | quote }}
            - name: BACKEND
              value: {{ .Values.benchmark.backend | quote }}
            - name: PORT
              value: {{ .Values.benchmark.port | quote }}
            - name: INPUT_LENGTH
              value: {{ .Values.benchmark.inputLength | quote }}
            - name: OUTPUT_LENGTH
              value: {{ .Values.benchmark.outputLength | quote }}
            - name: FILE_PREFIX
              value: {{ .Values.benchmark.filePrefix | quote }}
            - name: PROMPT_DATASET_FILE
              value: ShareGPT_V3_unfiltered_cleaned_split.json
            - name: TRAFFIC_SPLIT
              value: {{ .Values.benchmark.trafficSplit | quote }}
            - name: SCRAPE_SERVER_METRICS
              value: {{ .Values.benchmark.scrapeServerMetrics | quote }}
            - name: SAVE_AGGREGATION_RESULT
              value: {{ .Values.benchmark.saveAggregatedResult | quote }}
            - name: STREAM_REQUEST
              value: {{ .Values.benchmark.streamRequest | quote }}
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  key: token
                  name: hf-token
          resources:
            limits:
              cpu: "2"
              memory: 20Gi
            requests:
              cpu: "2"
              memory: 20Gi
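
Because the chart resolves the target IP with Helm's `lookup` function, a plain `helm template` render (which has no cluster access) will hit the `fail` guards above. A minimal sketch of previewing the rendered Deployment against a live cluster instead, assuming a recent Helm release that supports server-side dry runs and the gateway from the getting-started guide:

```bash
# Server-side dry run: lets `lookup` query the live cluster, so the
# Gateway/Service existence checks and $targetIP resolution actually run.
helm install benchmark-tool ./config/charts/benchmark \
  --dry-run=server \
  --set modelServingEndpoint.mode=gateway \
  --set modelServingEndpoint.name=inference-gateway \
  --set modelServingEndpoint.namespace=default
```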
22 changes: 22 additions & 0 deletions config/charts/benchmark/values.yaml
@@ -0,0 +1,22 @@
benchmark:
  requestRates: "10,20,30"
  timeSeconds: 60
  maxNumPrompts:
  tokenizer: "meta-llama/Llama-3.1-8B-Instruct"
  models: "meta-llama/Llama-3.1-8B-Instruct"
  backend: "vllm"
  port: 80
  inputLength: 1024
  outputLength: 2048
  filePrefix: "benchmark"
  trafficSplit:
  scrapeServerMetrics:
  saveAggregatedResult:
  streamRequest:
modelServingEndpoint:
  # `gateway` to select the endpoint from the inference Gateway
  # `service` to select the endpoint from the LoadBalancer service created on top of the vLLM model server deployment
  mode: gateway
  name: vllm-llama3-8b-instruct
  namespace: default
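
The same overrides can be collected in a values file instead of repeated `--set` flags; a minimal sketch, assuming a hypothetical `my-values.yaml` that mirrors the structure above:

```bash
# Install using overrides from a local values file
# (my-values.yaml is a hypothetical example following the structure of values.yaml above).
helm install benchmark-tool ./config/charts/benchmark -f my-values.yaml
```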

60 changes: 0 additions & 60 deletions config/manifests/benchmark/benchmark.yaml

This file was deleted.

12 changes: 0 additions & 12 deletions config/manifests/benchmark/model-server-service.yaml

This file was deleted.

39 changes: 13 additions & 26 deletions site-src/performance/benchmark/index.md
@@ -36,41 +36,28 @@ The LPG benchmark tool works by sending traffic to the specified target IP and p
Follow the steps below to run a single benchmark. Multiple LPG instances can be deployed to run benchmarks in
parallel against different targets.

1. Check out the repo.

1. Deploy the LPG benchmark tool by installing the Helm chart below.
```bash
git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
cd gateway-api-inference-extension
export BENCHMARK_DEPLOYMENT_NAME=benchmark-tool
helm install $BENCHMARK_DEPLOYMENT_NAME \
  --set modelServingEndpoint.mode=service \
  --set modelServingEndpoint.name=vllm-llama3-8b-instruct \
  --set modelServingEndpoint.namespace=default \
  oci://registry.k8s.io/gateway-api-inference-extension/charts/benchmark
```

1. Get the target IP. The examples below shows how to get the IP of a gateway or a k8s service.
## Download the results
1. Check out the repo to get the tools used to download and analyse the benchmark results.

```bash
# Get gateway IP
GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
# Get LoadBalancer k8s service IP
SVC_IP=$(kubectl get service/vllm-llama2-7b -o jsonpath='{.status.loadBalancer.ingress[0].ip}')

echo $GW_IP
echo $SVC_IP
```

1. Then update the `<target-ip>` in `./config/manifests/benchmark/benchmark.yaml` to the value of `$SVC_IP` or `$GW_IP`.
Feel free to adjust other parameters such as `request_rates` as well. For a complete list of LPG configurations, refer to the
[LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark).

1. Start the benchmark tool.

```bash
kubectl apply -f ./config/manifests/benchmark/benchmark.yaml
git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
cd gateway-api-inference-extension
```

1. Wait for benchmark to finish and download the results. Use the `benchmark_id` environment variable to specify what this
benchmark is for. For instance, `inference-extension` or `k8s-svc`. When the LPG tool finishes benchmarking, it will print
a log line `LPG_FINISHED`. The script below will watch for that log line and then start downloading results.
1. When the LPG tool finishes benchmarking, it prints a log line `LPG_FINISHED`. The script below watches for that log line and then starts downloading the results. Use the `benchmark_id` environment variable to describe what this benchmark is for, for instance `inference-extension` or `k8s-svc`, and set the `BENCHMARK_DEPLOYMENT_NAME` environment variable to the deployment name used in the previous step so that the results are downloaded from the corresponding deployment.

```bash
benchmark_id='k8s-svc' ./tools/benchmark/download-benchmark-results.bash
benchmark_id='k8s-svc' BENCHMARK_DEPLOYMENT_NAME=benchmark-tool ./tools/benchmark/download-benchmark-results.bash
```

After the script finishes, you should see benchmark results under `./tools/benchmark/output/default-run/k8s-svc/results/json` folder.
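
To sanity-check the download before running the analysis tooling, you can list the files and inspect one of them; a minimal sketch, assuming the `k8s-svc` benchmark id used above (the exact JSON layout is defined by the LPG tool, so adjust the `jq` filter as needed):

```bash
# Results directory produced by download-benchmark-results.bash for benchmark_id=k8s-svc.
RESULTS_DIR=./tools/benchmark/output/default-run/k8s-svc/results/json
ls -lh "${RESULTS_DIR}"
# Print the top-level keys of the first result file to confirm it parsed as JSON.
jq 'keys' "$(ls "${RESULTS_DIR}"/*.json | head -n 1)"
```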
6 changes: 3 additions & 3 deletions tools/benchmark/download-benchmark-results.bash
@@ -2,8 +2,8 @@

# Downloads the benchmark result files from the benchmark tool pod.
download_benchmark_results() {
until echo $(kubectl logs deployment/benchmark-tool -n ${namespace}) | grep -q -m 1 "LPG_FINISHED"; do sleep 30 ; done;
benchmark_pod=$(kubectl get pods -l app=benchmark-tool -n ${namespace} -o jsonpath="{.items[0].metadata.name}")
until echo $(kubectl logs deployment/$BENCHMARK_DEPLOYMENT_NAME -n ${namespace}) | grep -q -m 1 "LPG_FINISHED"; do sleep 30 ; done;
benchmark_pod=$(kubectl get pods -l app=$BENCHMARK_DEPLOYMENT_NAME -n ${namespace} -o jsonpath="{.items[0].metadata.name}")
echo "Downloading JSON results from pod ${benchmark_pod}"
kubectl exec ${benchmark_pod} -n ${namespace} -- rm -f ShareGPT_V3_unfiltered_cleaned_split.json
for f in $(kubectl exec ${benchmark_pod} -n ${namespace} -- /bin/sh -c ls -l | grep json); do
@@ -27,4 +27,4 @@ benchmark_output_dir=${SCRIPT_DIR}/${output_dir}/${run_id}/${benchmark_id}

echo "Saving benchmark results to ${benchmark_output_dir}/results/json/"
download_benchmark_results
kubectl delete -f ${SCRIPT_DIR}/../../config/manifests/benchmark/benchmark.yaml
helm uninstall $BENCHMARK_DEPLOYMENT_NAME