From 15f97cbaa128fa9783bfdd64d4769521d748fe7c Mon Sep 17 00:00:00 2001
From: Dharaneeshwaran Ravichandran
Date: Mon, 29 Sep 2025 17:55:18 +0530
Subject: [PATCH] Add helm chart for running benchmark tool

---
 config/charts/benchmark/.helmignore           | 23 +++++
 config/charts/benchmark/Chart.yaml            |  9 ++
 config/charts/benchmark/README.md             | 51 ++++++++++
 .../benchmark/templates/deployment.yaml       | 92 +++++++++++++++++++
 config/charts/benchmark/values.yaml           | 22 +++++
 config/manifests/benchmark/benchmark.yaml     | 60 ------------
 .../benchmark/model-server-service.yaml       | 12 ---
 site-src/performance/benchmark/index.md       | 39 +++-----
 .../benchmark/download-benchmark-results.bash |  6 +-
 9 files changed, 213 insertions(+), 101 deletions(-)
 create mode 100644 config/charts/benchmark/.helmignore
 create mode 100644 config/charts/benchmark/Chart.yaml
 create mode 100644 config/charts/benchmark/README.md
 create mode 100644 config/charts/benchmark/templates/deployment.yaml
 create mode 100644 config/charts/benchmark/values.yaml
 delete mode 100644 config/manifests/benchmark/benchmark.yaml
 delete mode 100644 config/manifests/benchmark/model-server-service.yaml

diff --git a/config/charts/benchmark/.helmignore b/config/charts/benchmark/.helmignore
new file mode 100644
index 000000000..0e8a0eb36
--- /dev/null
+++ b/config/charts/benchmark/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/config/charts/benchmark/Chart.yaml b/config/charts/benchmark/Chart.yaml
new file mode 100644
index 000000000..d8ade8506
--- /dev/null
+++ b/config/charts/benchmark/Chart.yaml
@@ -0,0 +1,9 @@
+apiVersion: v2
+name: benchmark
+description: A Helm chart for running the benchmark tool
+
+type: application
+
+version: 0.0.0
+
+appVersion: "0.0.0"
diff --git a/config/charts/benchmark/README.md b/config/charts/benchmark/README.md
new file mode 100644
index 000000000..437e1d395
--- /dev/null
+++ b/config/charts/benchmark/README.md
@@ -0,0 +1,51 @@
+# Benchmark
+
+A chart to deploy the benchmark tool against a vLLM model server deployment set up via the [getting started guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/#getting-started-with-gateway-api-inference-extension).
+
+## Install
+
+To install the benchmark tool:
+
+```txt
+$ helm install benchmark-tool ./config/charts/benchmark \
+    --set modelServingEndpoint.mode=gateway \
+    --set modelServingEndpoint.name=inference-gateway \
+    --set modelServingEndpoint.namespace=default
+```
+
+## Uninstall
+
+Run the following command to uninstall the chart:
+
+```txt
+$ helm uninstall benchmark-tool
+```
+
+## Configuration
+
+The following table lists the configurable parameters of the chart.
+
+| **Parameter Name**                           | **Description**                                                                                      |
+|----------------------------------------------|------------------------------------------------------------------------------------------------------|
+| `benchmark.requestRates`                     | Comma-separated list of request rates (requests per second). A benchmark run is executed against the vLLM deployment for each request rate. |
+| `benchmark.timeSeconds`                      | Duration of each benchmark run in seconds. For each request rate, the number of prompts is calculated as `requestRate * timeSeconds`. |
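+| `benchmark.maxNumPrompts`                    | Maximum number of prompts to process. Only considered when `requestRates` is not set.                 |
+| `benchmark.tokenizer`                        | Name or path of the tokenizer.                                                                         |
+| `benchmark.models`                           | Comma-separated list of models to benchmark.                                                           |
+| `benchmark.backend`                          | Model serving backend. Default: `vllm`.                                                                |
+| `benchmark.port`                             | Port of the model serving backend server.                                                              |
+| `benchmark.inputLength`                      | Maximum number of input tokens for filtering the benchmark dataset.                                    |
+| `benchmark.outputLength`                     | Maximum number of output tokens for filtering the benchmark dataset.                                   |
+| `benchmark.filePrefix`                       | Prefix for the benchmark results output file.                                                          |
+| `benchmark.trafficSplit`                     | Comma-separated list of traffic split proportions for the models, e.g. `'0.9,0.1'`. The sum must equal 1.0. |
+| `benchmark.scrapeServerMetrics`              | Whether to scrape server metrics.                                                                      |
+| `benchmark.saveAggregatedResult`             | Whether to aggregate the results of all models and save the aggregated result.                         |
+| `benchmark.streamRequest`                    | Whether to stream the request. Required for the TTFT metric.                                           |
+| `modelServingEndpoint.mode`                  | Mode in which the LPG tool consumes the model serving endpoint for benchmarking: `gateway` or `service`. |
+| `modelServingEndpoint.name`                  | Name of the model serving endpoint resource, i.e. the name of the inference gateway or of the LoadBalancer Service. |
+| `modelServingEndpoint.namespace`             | Namespace of the `modelServingEndpoint` resource, i.e. the namespace of the inference gateway or of the LoadBalancer Service. |
+
+Benchmark parameters can be overridden at install time in the same way; the values below are only illustrative (note that commas inside a `--set` value must be escaped):
+
+```txt
+$ helm install benchmark-tool ./config/charts/benchmark \
+    --set modelServingEndpoint.mode=gateway \
+    --set modelServingEndpoint.name=inference-gateway \
+    --set modelServingEndpoint.namespace=default \
+    --set benchmark.requestRates="5\,10" \
+    --set benchmark.timeSeconds=120 \
+    --set benchmark.streamRequest=true
+```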
diff --git a/config/charts/benchmark/templates/deployment.yaml b/config/charts/benchmark/templates/deployment.yaml
new file mode 100644
index 000000000..62a781e85
--- /dev/null
+++ b/config/charts/benchmark/templates/deployment.yaml
@@ -0,0 +1,92 @@
+{{- $targetIP := "" -}}
+{{- if eq .Values.modelServingEndpoint.mode "gateway" -}}
+  {{- /* Resolve the target IP from the Gateway's status. */ -}}
+  {{- $gw := lookup "gateway.networking.k8s.io/v1" "Gateway" .Values.modelServingEndpoint.namespace .Values.modelServingEndpoint.name -}}
+  {{- if not $gw }}
+    {{- fail (printf "Gateway %s not found in namespace %s. Please create it before installing this chart." .Values.modelServingEndpoint.name .Values.modelServingEndpoint.namespace) -}}
+  {{- end }}
+  {{- if or (not $gw.status) (not $gw.status.addresses) -}}
+    {{- fail (printf "Gateway %s found, but .status.addresses is not populated yet. Please wait until an IP is assigned." .Values.modelServingEndpoint.name) -}}
+  {{- end }}
+  {{- $targetIP = (index $gw.status.addresses 0).value | quote -}}
+{{- end }}
+{{- if eq .Values.modelServingEndpoint.mode "service" -}}
+  {{- /* Resolve the target IP from the LoadBalancer Service's status. */ -}}
+  {{- $svc := lookup "v1" "Service" .Values.modelServingEndpoint.namespace .Values.modelServingEndpoint.name -}}
+  {{- if not $svc }}
+    {{- fail (printf "Service %s not found in namespace %s. Please create it before installing this chart." .Values.modelServingEndpoint.name .Values.modelServingEndpoint.namespace) -}}
+  {{- end }}
+  {{- if or (not $svc.status) (not $svc.status.loadBalancer) (not $svc.status.loadBalancer.ingress) -}}
+    {{- fail (printf "Service %s found, but .status.loadBalancer.ingress is not populated yet. Please wait until an IP is assigned." .Values.modelServingEndpoint.name) -}}
+  {{- end }}
+  {{- $targetIP = (index $svc.status.loadBalancer.ingress 0).ip | quote -}}
+{{- end }}
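+{{- /* Neither branch resolved an address, so the configured mode is unsupported; only "gateway" and "service" are handled above. */ -}}
+{{- if not $targetIP -}}
+  {{- fail (printf "Unsupported modelServingEndpoint.mode %q: expected \"gateway\" or \"service\"." .Values.modelServingEndpoint.mode) -}}
+{{- end }}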
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app: {{ .Release.Name }}
+  name: {{ .Release.Name }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: {{ .Release.Name }}
+  template:
+    metadata:
+      labels:
+        app: {{ .Release.Name }}
+    spec:
+      containers:
+      # The following image was built from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/07628c9fe01b748f5a4cc9e5c2ee4234aaf47699
+      - image: 'us-docker.pkg.dev/cloud-tpu-images/inference/inference-benchmark@sha256:1c100b0cc949c7df7a2db814ae349c790f034b4b373aaad145e77e815e838438'
+        imagePullPolicy: Always
+        name: {{ .Release.Name }}
+        command:
+        - bash
+        - -c
+        - ./latency_throughput_curve.sh
+        env:
+        - name: IP
+          value: {{ $targetIP }}
+        - name: REQUEST_RATES
+          value: {{ .Values.benchmark.requestRates | quote }}
+        - name: BENCHMARK_TIME_SECONDS
+          value: {{ .Values.benchmark.timeSeconds | quote }}
+        - name: MAX_NUM_PROMPTS
+          value: {{ .Values.benchmark.maxNumPrompts | quote }}
+        - name: TOKENIZER
+          value: {{ .Values.benchmark.tokenizer | quote }}
+        - name: MODELS
+          value: {{ .Values.benchmark.models | quote }}
+        - name: BACKEND
+          value: {{ .Values.benchmark.backend | quote }}
+        - name: PORT
+          value: {{ .Values.benchmark.port | quote }}
+        - name: INPUT_LENGTH
+          value: {{ .Values.benchmark.inputLength | quote }}
+        - name: OUTPUT_LENGTH
+          value: {{ .Values.benchmark.outputLength | quote }}
+        - name: FILE_PREFIX
+          value: {{ .Values.benchmark.filePrefix | quote }}
+        - name: PROMPT_DATASET_FILE
+          value: ShareGPT_V3_unfiltered_cleaned_split.json
+        - name: TRAFFIC_SPLIT
+          value: {{ .Values.benchmark.trafficSplit | quote }}
+        - name: SCRAPE_SERVER_METRICS
+          value: {{ .Values.benchmark.scrapeServerMetrics | quote }}
+        - name: SAVE_AGGREGATION_RESULT
+          value: {{ .Values.benchmark.saveAggregatedResult | quote }}
+        - name: STREAM_REQUEST
+          value: {{ .Values.benchmark.streamRequest | quote }}
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              key: token
+              name: hf-token
+        resources:
+          limits:
+            cpu: "2"
+            memory: 20Gi
+          requests:
+            cpu: "2"
+            memory: 20Gi
diff --git a/config/charts/benchmark/values.yaml b/config/charts/benchmark/values.yaml
new file mode 100644
index 000000000..c7e14fbf0
--- /dev/null
+++ b/config/charts/benchmark/values.yaml
@@ -0,0 +1,22 @@
+benchmark:
+  requestRates: "10,20,30"
+  timeSeconds: 60
+  maxNumPrompts:
+  tokenizer: "meta-llama/Llama-3.1-8B-Instruct"
+  models: "meta-llama/Llama-3.1-8B-Instruct"
+  backend: "vllm"
+  port: 80
+  inputLength: 1024
+  outputLength: 2048
+  filePrefix: "benchmark"
+  trafficSplit:
+  scrapeServerMetrics:
+  saveAggregatedResult:
+  streamRequest:
+modelServingEndpoint:
+  # `gateway` selects the endpoint from the inference Gateway
+  # `service` selects the endpoint from the LoadBalancer Service created on top of the vLLM model server deployment
+  mode: gateway
+  name: vllm-llama3-8b-instruct
+  namespace: default
diff --git a/config/manifests/benchmark/benchmark.yaml b/config/manifests/benchmark/benchmark.yaml
deleted file mode 100644
index abf9ae5f6..000000000
--- a/config/manifests/benchmark/benchmark.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  labels:
-    app: benchmark-tool
-  name: benchmark-tool
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: benchmark-tool
-  template:
-    metadata:
-      labels:
-        app: benchmark-tool
-    spec:
-      containers:
-      # The following image was built from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/07628c9fe01b748f5a4cc9e5c2ee4234aaf47699
-      - image: 'us-docker.pkg.dev/cloud-tpu-images/inference/inference-benchmark@sha256:1c100b0cc949c7df7a2db814ae349c790f034b4b373aaad145e77e815e838438'
-        imagePullPolicy: Always
-        name: benchmark-tool
-        command:
-        - bash
-        - -c
-        - ./latency_throughput_curve.sh
-        env:
-        - name: IP
-          value: ''
-        - name: REQUEST_RATES
-          value: '10,20,30'
-        - name: BENCHMARK_TIME_SECONDS
-          value: '60'
-        - name: TOKENIZER
-          value: 'meta-llama/Llama-3.1-8B-Instruct'
-        - name: MODELS
-          value: 'meta-llama/Llama-3.1-8B-Instruct'
-        - name: BACKEND
-          value: vllm
-        - name: PORT
-          value: "80"
-        - name: INPUT_LENGTH
-          value: "1024"
-        - name: OUTPUT_LENGTH
-          value: '2048'
-        - name: FILE_PREFIX
-          value: benchmark
-        - name: PROMPT_DATASET_FILE
-          value: ShareGPT_V3_unfiltered_cleaned_split.json
-        - name: HF_TOKEN
-          valueFrom:
-            secretKeyRef:
-              key: token
-              name: hf-token
-        resources:
-          limits:
-            cpu: "2"
-            memory: 20Gi
-          requests:
-            cpu: "2"
-            memory: 20Gi
diff --git a/config/manifests/benchmark/model-server-service.yaml b/config/manifests/benchmark/model-server-service.yaml
deleted file mode 100644
index 014054cf8..000000000
--- a/config/manifests/benchmark/model-server-service.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: my-pool-service
-spec:
-  ports:
-  - port: 8081
-    protocol: TCP
-    targetPort: 8000
-  selector:
-    app: my-pool
-  type: LoadBalancer
diff --git a/site-src/performance/benchmark/index.md b/site-src/performance/benchmark/index.md
index 42d5e727b..d24da41c3 100644
--- a/site-src/performance/benchmark/index.md
+++ b/site-src/performance/benchmark/index.md
@@ -36,41 +36,28 @@ The LPG benchmark tool works by sending traffic to the specified target IP and port
 
 Follow the steps below to run a single benchmark. Multiple LPG instances can be deployed to run benchmarks in parallel against different targets.
 
-1. Check out the repo.
-
+1. Install the LPG benchmark tool by running the helm chart below.
+
     ```bash
-    git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
-    cd gateway-api-inference-extension
+    export BENCHMARK_DEPLOYMENT_NAME=benchmark-tool
+    helm install $BENCHMARK_DEPLOYMENT_NAME \
+      --set modelServingEndpoint.mode=service \
+      --set modelServingEndpoint.name=vllm-llama3-8b-instruct \
+      --set modelServingEndpoint.namespace=default \
+      oci://registry.k8s.io/gateway-api-inference-extension/charts/benchmark
     ```
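+
+    The chart looks up the endpoint and resolves the target IP from its status at install time, and fails if no address has been assigned yet. You can verify that an address is populated before installing, for example:
+
+    ```bash
+    # LoadBalancer Service IP (mode=service)
+    kubectl get service/vllm-llama3-8b-instruct -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+    # Gateway address (mode=gateway)
+    kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'
+    ```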
 
-1. Get the target IP. The examples below shows how to get the IP of a gateway or a k8s service.
+## Download the results
+
+1. Check out the repo to use the tools for downloading and analysing the benchmark results.
 
     ```bash
-    # Get gateway IP
-    GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
-    # Get LoadBalancer k8s service IP
-    SVC_IP=$(kubectl get service/vllm-llama2-7b -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
-
-    echo $GW_IP
-    echo $SVC_IP
-    ```
-
-1. Then update the `<target-ip>` in `./config/manifests/benchmark/benchmark.yaml` to the value of `$SVC_IP` or `$GW_IP`.
-   Feel free to adjust other parameters such as `request_rates` as well. For a complete list of LPG configurations, refer to the
-   [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark).
-
-1. Start the benchmark tool.
-
-    ```bash
-    kubectl apply -f ./config/manifests/benchmark/benchmark.yaml
+    git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
+    cd gateway-api-inference-extension
     ```
 
-1. Wait for benchmark to finish and download the results. Use the `benchmark_id` environment variable to specify what this
-   benchmark is for. For instance, `inference-extension` or `k8s-svc`. When the LPG tool finishes benchmarking, it will print
-   a log line `LPG_FINISHED`. The script below will watch for that log line and then start downloading results.
+1. When the LPG tool finishes benchmarking, it prints the log line `LPG_FINISHED`. The script below watches for that log line and then
+   starts downloading the results. Use the `benchmark_id` environment variable to specify what this benchmark is for, e.g.
+   `inference-extension` or `k8s-svc`. Set `BENCHMARK_DEPLOYMENT_NAME` to the deployment name used when installing the helm chart in
+   the previous step, so that the results are downloaded from the corresponding deployment.
 
     ```bash
-    benchmark_id='k8s-svc' ./tools/benchmark/download-benchmark-results.bash
+    benchmark_id='k8s-svc' BENCHMARK_DEPLOYMENT_NAME=benchmark-tool ./tools/benchmark/download-benchmark-results.bash
     ```
 
 After the script finishes, you should see benchmark results under `./tools/benchmark/output/default-run/k8s-svc/results/json` folder.
diff --git a/tools/benchmark/download-benchmark-results.bash b/tools/benchmark/download-benchmark-results.bash
index 6b9ca5057..0b167045c 100755
--- a/tools/benchmark/download-benchmark-results.bash
+++ b/tools/benchmark/download-benchmark-results.bash
@@ -2,8 +2,8 @@
 # Downloads the benchmark result files from the benchmark tool pod.
 download_benchmark_results() {
-  until echo $(kubectl logs deployment/benchmark-tool -n ${namespace}) | grep -q -m 1 "LPG_FINISHED"; do sleep 30 ; done;
-  benchmark_pod=$(kubectl get pods -l app=benchmark-tool -n ${namespace} -o jsonpath="{.items[0].metadata.name}")
+  until echo $(kubectl logs deployment/$BENCHMARK_DEPLOYMENT_NAME -n ${namespace}) | grep -q -m 1 "LPG_FINISHED"; do sleep 30 ; done;
+  benchmark_pod=$(kubectl get pods -l app=$BENCHMARK_DEPLOYMENT_NAME -n ${namespace} -o jsonpath="{.items[0].metadata.name}")
   echo "Downloading JSON results from pod ${benchmark_pod}"
   kubectl exec ${benchmark_pod} -n ${namespace} -- rm -f ShareGPT_V3_unfiltered_cleaned_split.json
   for f in $(kubectl exec ${benchmark_pod} -n ${namespace} -- /bin/sh -c ls -l | grep json); do
@@ -27,4 +27,4 @@ benchmark_output_dir=${SCRIPT_DIR}/${output_dir}/${run_id}/${benchmark_id}
 echo "Saving benchmark results to ${benchmark_output_dir}/results/json/"
 download_benchmark_results
-kubectl delete -f ${SCRIPT_DIR}/../../config/manifests/benchmark/benchmark.yaml
\ No newline at end of file
+helm uninstall $BENCHMARK_DEPLOYMENT_NAME