5 changes: 5 additions & 0 deletions benchmarking/precise-prefix-cache-aware/Chart.yaml
@@ -0,0 +1,5 @@
apiVersion: v2
name: precise-prefix-cache-aware
description: A Helm chart for precise-prefix-cache-aware benchmarking
version: 0.1.0
appVersion: "1.0"
94 changes: 94 additions & 0 deletions benchmarking/precise-prefix-cache-aware/README.md
@@ -0,0 +1,94 @@
# Precise Prefix Cache Aware Benchmarking Helm Chart

This Helm chart deploys the `inference-perf` benchmarking tool with two distinct configurations: a high-cache scenario and a low-cache scenario. This chart specifically utilizes the **shared prefix dataset** for benchmarking. This guide will walk you through deploying both.

## Prerequisites

Before you begin, ensure you have the following:

* **Helm 3+**: [Installation Guide](https://helm.sh/docs/intro/install/)
* **Kubernetes Cluster**: Access to a Kubernetes cluster
* **Gateway Deployed**: Your inference server/gateway must be deployed and accessible within the cluster.


**Hugging Face Token Secret**

The benchmark requires a Hugging Face token to pull models. Create a Kubernetes Secret named `hf-token` (or a custom name you provide) in your target namespace, containing your Hugging Face token.

To create this secret:
```bash
export _HF_TOKEN='<YOUR_HF_TOKEN>'
kubectl create secret generic hf-token --from-literal=token=$_HF_TOKEN
```

## Shared Prefix Dataset Configuration

The chart uses the `shared_prefix` dataset type, which is designed to test caching efficiency. These parameters are located under `config.data.shared_prefix`:

* `num_groups`: The number of shared prefix groups.
* `num_prompts_per_group`: The number of prompts within each shared prefix group.
* `system_prompt_len`: The length of the system prompt.
* `question_len`: The length of the question part of the prompt.
* `output_len`: The desired length of the model's output.

The default values for the dataset are defined in the chart, but you can override them using `--set config.data.shared_prefix.<parameter>` flags.

Example:

```bash
helm install my-release . -f high-cache-values.yaml --set config.data.shared_prefix.num_groups=512
```
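As a sanity check on sizing, the defaults above translate directly into request volume and per-request input length. A quick sketch using the high-cache defaults from `high-cache-values.yaml` (lengths are approximate token counts):

```shell
num_groups=256
num_prompts_per_group=16
system_prompt_len=2048   # shared prefix per group (high-cache default)
question_len=256         # unique suffix per prompt (high-cache default)

total_prompts=$(( num_groups * num_prompts_per_group ))
input_len=$(( system_prompt_len + question_len ))
echo "total prompts: ${total_prompts}"          # 4096
echo "input tokens per prompt: ~${input_len}"   # ~2304
```

Doubling `num_groups` via `--set` therefore doubles the number of distinct shared prefixes the server must cache, without changing the per-prompt input length.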

## Deployment

This chart supports two main configurations, defined in `high-cache-values.yaml` and `low-cache-values.yaml`.

### 1. Deploying the High-Cache Configuration

This configuration is optimized for scenarios where a high cache hit rate is expected. It uses the `high-cache-values.yaml` file.

```bash
export IP='<YOUR_IP>'
export PORT='<YOUR_PORT>'
helm install high-cache . -f high-cache-values.yaml \
--set hfTokenSecret.name=hf-token \
--set hfTokenSecret.key=token \
--set "config.server.base_url=http://${IP}:${PORT}"
```

**Parameters to customize:**

* `high-cache`: The Helm release name for this deployment.
* `hfTokenSecret.name`: The name of your Kubernetes Secret containing the Hugging Face token (default: `hf-token`).
* `hfTokenSecret.key`: The key in your Kubernetes Secret pointing to the Hugging Face token (default: `token`).
* `config.server.base_url`: The base URL (IP and port) of your inference server for the high-cache scenario.
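The chart's `fullname` helper names the Job `<release>-precise-prefix-cache-aware-job` (the release name above does not contain the chart name, so the two are concatenated). A sketch of the name derivation for the `high-cache` release:

```shell
# Derive the Job name produced by the chart's fullname helper.
release="high-cache"
job_name="${release}-precise-prefix-cache-aware-job"
echo "${job_name}"   # high-cache-precise-prefix-cache-aware-job
```

You can then follow the run with `kubectl logs -f job/${job_name}` or wait for it with `kubectl wait --for=condition=complete job/${job_name}`.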

### 2. Deploying the Low-Cache Configuration

This configuration is designed for scenarios with a lower cache hit rate. It uses the `low-cache-values.yaml` file.

```bash
export IP='<YOUR_IP>'
export PORT='<YOUR_PORT>'
helm install low-cache . -f low-cache-values.yaml \
--set hfTokenSecret.name=hf-token \
--set hfTokenSecret.key=token \
--set "config.server.base_url=http://${IP}:${PORT}"
```

**Parameters to customize:**

* `low-cache`: The Helm release name for this deployment.
* `hfTokenSecret.name`: The name of your Kubernetes Secret containing the Hugging Face token (default: `hf-token`).
* `hfTokenSecret.key`: The key in your Kubernetes Secret pointing to the Hugging Face token (default: `token`).
* `config.server.base_url`: The base URL (IP and port) of your inference server for the low-cache scenario.
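The two values files differ only in log level and in the swapped prefix/question lengths, which flips the shared (and therefore cacheable) fraction of each prompt's input. A sketch using the chart defaults:

```shell
# Shared-prefix share of each prompt's input, in percent (integer arithmetic).
high_shared=$(( 2048 * 100 / (2048 + 256) ))   # high-cache: long shared prefix
low_shared=$((  256 * 100 / ( 256 + 2048) ))   # low-cache: long unique question
echo "high-cache shared fraction: ${high_shared}%"   # 88%
echo "low-cache shared fraction: ${low_shared}%"     # 11%
```

Total input length per prompt is identical in both scenarios (~2304 tokens), so any throughput difference between the two runs is attributable to prefix-cache hit rate rather than workload size.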

## Uninstalling the Charts

To uninstall the deployed charts:

```bash
helm uninstall high-cache
helm uninstall low-cache
```
63 changes: 63 additions & 0 deletions benchmarking/precise-prefix-cache-aware/high-cache-values.yaml
@@ -0,0 +1,63 @@
# High-Cache Configuration
job:
image: "quay.io/inference-perf/inference-perf:latest"
memory: "8G"

logLevel: DEBUG

hfTokenSecret:
name: hf-token
key: token

config:
load:
type: constant
interval: 15
stages:
- rate: 100
duration: 30
- rate: 200
duration: 30
- rate: 300
duration: 30
- rate: 400
duration: 30
- rate: 500
duration: 30
- rate: 600
duration: 30
- rate: 700
duration: 30
- rate: 800
duration: 30
worker_max_concurrency: 1000
api:
type: completion
streaming: true
server:
type: vllm
model_name: meta-llama/Llama-3.1-8B-Instruct
base_url: http://0.0.0.0:8000
ignore_eos: true
tokenizer:
pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
data:
type: shared_prefix
shared_prefix:
num_groups: 256
num_prompts_per_group: 16
system_prompt_len: 2048 # High-cache setting
question_len: 256 # High-cache setting
output_len: 256
metrics:
type: prometheus
prometheus:
google_managed: true
report:
request_lifecycle:
summary: true
per_stage: true
per_request: true
prometheus:
summary: true
per_stage: true
63 changes: 63 additions & 0 deletions benchmarking/precise-prefix-cache-aware/low-cache-values.yaml
@@ -0,0 +1,63 @@
# Low-Cache Configuration
job:
image: "quay.io/inference-perf/inference-perf:latest"
memory: "8G"

logLevel: INFO

hfTokenSecret:
name: hf-token
key: token

config:
load:
type: constant
interval: 15
stages:
- rate: 100
duration: 30
- rate: 200
duration: 30
- rate: 300
duration: 30
- rate: 400
duration: 30
- rate: 500
duration: 30
- rate: 600
duration: 30
- rate: 700
duration: 30
- rate: 800
duration: 30
worker_max_concurrency: 1000
api:
type: completion
streaming: true
server:
type: vllm
model_name: meta-llama/Llama-3.1-8B-Instruct
base_url: http://0.0.0.0:8000
ignore_eos: true
tokenizer:
pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
data:
type: shared_prefix
shared_prefix:
num_groups: 256
num_prompts_per_group: 16
system_prompt_len: 256 # Low-cache setting
question_len: 2048 # Low-cache setting
output_len: 256
metrics:
type: prometheus
prometheus:
google_managed: true
report:
request_lifecycle:
summary: true
per_stage: true
per_request: true
prometheus:
summary: true
per_stage: true
72 changes: 72 additions & 0 deletions benchmarking/precise-prefix-cache-aware/templates/_helpers.tpl
@@ -0,0 +1,72 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "precise-prefix-cache-aware.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "precise-prefix-cache-aware.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "precise-prefix-cache-aware.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels
*/}}
{{- define "precise-prefix-cache-aware.labels" -}}
helm.sh/chart: {{ include "precise-prefix-cache-aware.chart" . }}
{{ include "precise-prefix-cache-aware.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels
*/}}
{{- define "precise-prefix-cache-aware.selectorLabels" -}}
app.kubernetes.io/name: {{ include "precise-prefix-cache-aware.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Config Mount Path
*/}}
{{- define "precise-prefix-cache-aware.configMount" -}}
{{- print "/etc/inference-perf" -}}
{{- end }}

{{/*
Hugging Face Secret Name
*/}}
{{- define "precise-prefix-cache-aware.hfSecret" -}}
{{- printf "%s-hf-secret" (include "precise-prefix-cache-aware.fullname" .) -}}
{{- end }}

{{/*
Hugging Face Secret Key
*/}}
{{- define "precise-prefix-cache-aware.hfKey" -}}
{{- print "token" -}}
{{- end }}
12 changes: 12 additions & 0 deletions benchmarking/precise-prefix-cache-aware/templates/configmap.yaml
@@ -0,0 +1,12 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "precise-prefix-cache-aware.fullname" . }}-config
labels:
{{- include "precise-prefix-cache-aware.labels" . | nindent 4 }}
data:
config.yaml: |
{{- /* Inject the HF token read from the referenced Secret at install time; note that `lookup` returns an empty map under `helm template`/`--dry-run=client`, so render against a live cluster. */ -}}
{{- $config := .Values.config | deepCopy -}}
{{- $secretToken := index (lookup "v1" "Secret" .Release.Namespace .Values.hfTokenSecret.name).data .Values.hfTokenSecret.key | b64dec -}}
{{- $_ := set $config.tokenizer "token" $secretToken -}}
{{- toYaml $config | nindent 4 }}
43 changes: 43 additions & 0 deletions benchmarking/precise-prefix-cache-aware/templates/job.yaml
@@ -0,0 +1,43 @@
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "precise-prefix-cache-aware.fullname" . }}-job
labels:
{{- include "precise-prefix-cache-aware.labels" . | nindent 4 }}
app: inference-perf
spec:
template:
metadata:
labels:
{{- include "precise-prefix-cache-aware.selectorLabels" . | nindent 8 }}
app: inference-perf
spec:
restartPolicy: Never
containers:
- name: inference-perf-container
image: {{ .Values.job.image }}
command: ["inference-perf"]
args:
- "--config_file"
- "{{ include "precise-prefix-cache-aware.configMount" . }}/config.yaml"
- "--log-level"
- {{ .Values.logLevel }}
env:
{{- if .Values.hfToken }}
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: {{ include "precise-prefix-cache-aware.hfSecret" . }}
key: {{ include "precise-prefix-cache-aware.hfKey" . }}
{{- end }}
volumeMounts:
- name: config-volume
mountPath: {{ include "precise-prefix-cache-aware.configMount" . }}
readOnly: true
resources:
requests:
memory: {{ .Values.job.memory }}
volumes:
- name: config-volume
configMap:
name: {{ include "precise-prefix-cache-aware.fullname" . }}-config
11 changes: 11 additions & 0 deletions benchmarking/precise-prefix-cache-aware/templates/secret.yaml
@@ -0,0 +1,11 @@
{{- if .Values.hfToken -}}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "precise-prefix-cache-aware.hfSecret" . }}
labels:
{{- include "precise-prefix-cache-aware.labels" . | nindent 4 }}
type: Opaque
data:
token: {{ .Values.hfToken | b64enc }}
{{- end }}