diff --git a/benchmarking/precise-prefix-cache-aware/Chart.yaml b/benchmarking/precise-prefix-cache-aware/Chart.yaml new file mode 100644 index 000000000..dc8c14d22 --- /dev/null +++ b/benchmarking/precise-prefix-cache-aware/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +name: precise-prefix-cache-aware +description: A Helm chart for precise-prefix-cache-aware benchmarking +version: 0.1.0 +appVersion: "1.0" diff --git a/benchmarking/precise-prefix-cache-aware/README.md b/benchmarking/precise-prefix-cache-aware/README.md new file mode 100644 index 000000000..0bd020eab --- /dev/null +++ b/benchmarking/precise-prefix-cache-aware/README.md @@ -0,0 +1,94 @@ +# Precise Prefix Cache Aware Benchmarking Helm Chart + +This Helm chart deploys the `inference-perf` benchmarking tool with two distinct configurations: a high-cache scenario and a low-cache scenario. This chart specifically utilizes the **shared prefix dataset** for benchmarking. This guide will walk you through deploying both. + +## Prerequisites + +Before you begin, ensure you have the following: + +* **Helm 3+**: [Installation Guide](https://helm.sh/docs/intro/install/) +* **Kubernetes Cluster**: Access to a Kubernetes cluster +* **Gateway Deployed**: Your inference server/gateway must be deployed and accessible within the cluster. + + +**Hugging Face Token Secret** + +The benchmark requires a Hugging Face token to pull models. Create a Kubernetes Secret named `hf-token` (or a custom name you provide) in your target namespace, containing your Hugging Face token. + + To create this secret: + ```bash + export _HF_TOKEN='' + kubectl create secret generic hf-token --from-literal=token=$_HF_TOKEN + ``` + +## Shared Prefix Dataset Configuration + +The chart uses the `shared_prefix` dataset type, which is designed to test caching efficiency. These parameters are located under config.data.shared_prefix: + +* `num_groups`: The number of shared prefix groups. 
+* `num_prompts_per_group`: The number of prompts within each shared prefix group. +* `system_prompt_len`: The length of the system prompt. +* `question_len`: The length of the question part of the prompt. +* `output_len`: The desired length of the model's output. + +The default values for the dataset are defined in the chart, but you can override them using `--set config.data.shared_prefix.` flags. + +Example: + +```bash +helm install my-release . -f high-cache-values.yaml --set config.data.shared_prefix.num_groups=512 +``` + +## Deployment + +This chart supports two main configurations, defined in `high-cache-values.yaml` and `low-cache-values.yaml`. + +### 1. Deploying the High-Cache Configuration + +This configuration is optimized for scenarios where a high cache hit rate is expected. It uses the `high-cache-values.yaml` file. + +```bash +export IP='' +export PORT='' +helm install high-cache . -f high-cache-values.yaml \ + --set hfTokenSecret.name=hf-token \ + --set hfTokenSecret.key=token \ + --set "config.server.base_url=http://${IP}:${PORT}" +``` + +**Parameters to customize:** + +* `high-cache`: A unique name for this deployment. +* `hfTokenSecret.name`: The name of your Kubernetes Secret containing the Hugging Face token (default: `hf-token`). +* `hfTokenSecret.key`: The key in your Kubernetes Secret pointing to the Hugging Face token (default: `token`). +* `config.server.base_url`: The base URL (IP and port) of your inference server for the high-cache scenario. + +### 2. Deploying the Low-Cache Configuration + +This configuration is designed for scenarios with a lower cache hit rate. It uses the `low-cache-values.yaml` file. + +```bash +export IP='' +export PORT='' +helm install low-cache . 
-f low-cache-values.yaml \ + --set hfTokenSecret.name=hf-token \ + --set hfTokenSecret.key=token \ + --set "config.server.base_url=http://${IP}:${PORT}" +``` + +**Parameters to customize:** + +* `low-cache`: A unique name for this deployment. +* `hfTokenSecret.name`: The name of your Kubernetes Secret containing the Hugging Face token (default: `hf-token`). +* `hfTokenSecret.key`: The key in your Kubernetes Secret pointing to the Hugging Face token (default: `token`). +* `config.server.base_url`: The base URL (IP and port) of your inference server for the low-cache scenario. + +## Uninstalling the Charts + +To uninstall the deployed charts: + +```bash +helm uninstall high-cache +helm uninstall low-cache +``` diff --git a/benchmarking/precise-prefix-cache-aware/high-cache-values.yaml b/benchmarking/precise-prefix-cache-aware/high-cache-values.yaml new file mode 100644 index 000000000..855f8400c --- /dev/null +++ b/benchmarking/precise-prefix-cache-aware/high-cache-values.yaml @@ -0,0 +1,63 @@ +# High-Cache Configuration +job: + image: "quay.io/inference-perf/inference-perf:latest" + memory: "8G" + +logLevel: DEBUG + +hfTokenSecret: + name: hf-token + key: token + +config: + load: + type: constant + interval: 15 + stages: + - rate: 100 + duration: 30 + - rate: 200 + duration: 30 + - rate: 300 + duration: 30 + - rate: 400 + duration: 30 + - rate: 500 + duration: 30 + - rate: 600 + duration: 30 + - rate: 700 + duration: 30 + - rate: 800 + duration: 30 + worker_max_concurrency: 1000 + api: + type: completion + streaming: true + server: + type: vllm + model_name: meta-llama/Llama-3.1-8B-Instruct + base_url: http://0.0.0.0:8000 + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct + data: + type: shared_prefix + shared_prefix: + num_groups: 256 + num_prompts_per_group: 16 + system_prompt_len: 2048 # High-cache setting + question_len: 256 # High-cache setting + output_len: 256 
+ metrics: + type: prometheus + prometheus: + google_managed: true + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + prometheus: + summary: true + per_stage: true diff --git a/benchmarking/precise-prefix-cache-aware/low-cache-values.yaml b/benchmarking/precise-prefix-cache-aware/low-cache-values.yaml new file mode 100644 index 000000000..685c39bef --- /dev/null +++ b/benchmarking/precise-prefix-cache-aware/low-cache-values.yaml @@ -0,0 +1,63 @@ +# Low-Cache Configuration +job: + image: "quay.io/inference-perf/inference-perf:latest" + memory: "8G" + +logLevel: INFO + +hfTokenSecret: + name: hf-token + key: token + +config: + load: + type: constant + interval: 15 + stages: + - rate: 100 + duration: 30 + - rate: 200 + duration: 30 + - rate: 300 + duration: 30 + - rate: 400 + duration: 30 + - rate: 500 + duration: 30 + - rate: 600 + duration: 30 + - rate: 700 + duration: 30 + - rate: 800 + duration: 30 + worker_max_concurrency: 1000 + api: + type: completion + streaming: true + server: + type: vllm + model_name: meta-llama/Llama-3.1-8B-Instruct + base_url: http://0.0.0.0:8000 + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct + data: + type: shared_prefix + shared_prefix: + num_groups: 256 + num_prompts_per_group: 16 + system_prompt_len: 256 # Low-cache setting + question_len: 2048 # Low-cache setting + output_len: 256 + metrics: + type: prometheus + prometheus: + google_managed: true + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + prometheus: + summary: true + per_stage: true diff --git a/benchmarking/precise-prefix-cache-aware/templates/_helpers.tpl b/benchmarking/precise-prefix-cache-aware/templates/_helpers.tpl new file mode 100644 index 000000000..9ec07e575 --- /dev/null +++ b/benchmarking/precise-prefix-cache-aware/templates/_helpers.tpl @@ -0,0 +1,72 @@ +{{/* +Expand the name of the chart. 
+*/}} +{{- define "precise-prefix-cache-aware.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "precise-prefix-cache-aware.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "precise-prefix-cache-aware.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "precise-prefix-cache-aware.labels" -}} +helm.sh/chart: {{ include "precise-prefix-cache-aware.chart" . }} +{{ include "precise-prefix-cache-aware.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "precise-prefix-cache-aware.selectorLabels" -}} +app.kubernetes.io/name: {{ include "precise-prefix-cache-aware.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Config Mount Path +*/}} +{{- define "precise-prefix-cache-aware.configMount" -}} +{{- print "/etc/inference-perf" -}} +{{- end }} + +{{/* +Hugging Face Secret Name +*/}} +{{- define "precise-prefix-cache-aware.hfSecret" -}} +{{- printf "%s-hf-secret" (include "precise-prefix-cache-aware.fullname" .) 
-}} +{{- end }} + +{{/* +Hugging Face Secret Key +*/}} +{{- define "precise-prefix-cache-aware.hfKey" -}} +{{- print "token" -}} +{{- end }} \ No newline at end of file diff --git a/benchmarking/precise-prefix-cache-aware/templates/configmap.yaml b/benchmarking/precise-prefix-cache-aware/templates/configmap.yaml new file mode 100644 index 000000000..0e7ccc4e6 --- /dev/null +++ b/benchmarking/precise-prefix-cache-aware/templates/configmap.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "precise-prefix-cache-aware.fullname" . }}-config + labels: + {{- include "precise-prefix-cache-aware.labels" . | nindent 4 }} +data: + config.yaml: | + {{- $config := .Values.config | deepCopy -}} + {{- $secretToken := index (lookup "v1" "Secret" .Release.Namespace .Values.hfTokenSecret.name).data .Values.hfTokenSecret.key | b64dec -}} + {{- $_ := set $config.tokenizer "token" $secretToken -}} + {{- toYaml $config | nindent 4 }} \ No newline at end of file diff --git a/benchmarking/precise-prefix-cache-aware/templates/job.yaml b/benchmarking/precise-prefix-cache-aware/templates/job.yaml new file mode 100644 index 000000000..100d454ac --- /dev/null +++ b/benchmarking/precise-prefix-cache-aware/templates/job.yaml @@ -0,0 +1,43 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "precise-prefix-cache-aware.fullname" . }}-job + labels: + {{- include "precise-prefix-cache-aware.labels" . | nindent 4 }} + app: inference-perf +spec: + template: + metadata: + labels: + {{- include "precise-prefix-cache-aware.selectorLabels" . | nindent 8 }} + app: inference-perf + spec: + restartPolicy: Never + containers: + - name: inference-perf-container + image: {{ .Values.job.image }} + command: ["inference-perf"] + args: + - "--config_file" + - "{{ include "precise-prefix-cache-aware.configMount" . 
}}/config.yaml" + - "--log-level" + - "{{ .Values.logLevel }}" + env: +{{- if .Values.hfToken }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ include "precise-prefix-cache-aware.hfSecret" . }} + key: {{ include "precise-prefix-cache-aware.hfKey" . }} +{{- end }} + volumeMounts: + - name: config-volume + mountPath: {{ include "precise-prefix-cache-aware.configMount" . }} + readOnly: true + resources: + requests: + memory: {{ .Values.job.memory }} + volumes: + - name: config-volume + configMap: + name: {{ include "precise-prefix-cache-aware.fullname" . }}-config \ No newline at end of file diff --git a/benchmarking/precise-prefix-cache-aware/templates/secret.yaml b/benchmarking/precise-prefix-cache-aware/templates/secret.yaml new file mode 100644 index 000000000..ef40f9704 --- /dev/null +++ b/benchmarking/precise-prefix-cache-aware/templates/secret.yaml @@ -0,0 +1,11 @@ +{{- if .Values.hfToken -}} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "precise-prefix-cache-aware.hfSecret" . }} + labels: + {{- include "precise-prefix-cache-aware.labels" . | nindent 4 }} +type: Opaque +data: + token: {{ .Values.hfToken | b64enc }} +{{- end }}