From 364ca7dc9bf21d13f35948ee1d38120932c7f55a Mon Sep 17 00:00:00 2001 From: Luca Berton Date: Wed, 9 Jul 2025 14:57:00 +0200 Subject: [PATCH 1/2] Deploy microsoft/Phi-3-mini-4k-instruct model and fix HOME and benchmark launch parameter --- extra/k8s/inference-benchmarker/templates/vllm.yaml | 2 ++ extra/k8s/inference-benchmarker/values.yaml | 9 ++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/extra/k8s/inference-benchmarker/templates/vllm.yaml b/extra/k8s/inference-benchmarker/templates/vllm.yaml index fb7178d..edf6730 100644 --- a/extra/k8s/inference-benchmarker/templates/vllm.yaml +++ b/extra/k8s/inference-benchmarker/templates/vllm.yaml @@ -43,6 +43,8 @@ spec: secretKeyRef: name: {{ include "inference-benchmarker.fullname" . }}-hf-token key: HF_TOKEN + - name: HOME + value: /tmp args: - "--model" - "{{ .Values.model_id }}" diff --git a/extra/k8s/inference-benchmarker/values.yaml b/extra/k8s/inference-benchmarker/values.yaml index 6fd4b20..dcdf73b 100644 --- a/extra/k8s/inference-benchmarker/values.yaml +++ b/extra/k8s/inference-benchmarker/values.yaml @@ -3,11 +3,11 @@ nameOverride: "" fullnameOverride: "" hf_token: "" -model_id: "meta-llama/Llama-3.1-8B-Instruct" -server: tgi +model_id: "microsoft/Phi-3-mini-4k-instruct" +server: vllm tgi: - enabled: true + enabled: false extra_args: - "--max-concurrent-requests" - "512" @@ -28,7 +28,7 @@ tgi: affinity: { } vllm: - enabled: false + enabled: true extra_args: image: repository: vllm/vllm-openai @@ -50,7 +50,6 @@ benchmark: extra_args: - "--profile" - "chat" - - "800" image: repository: ghcr.io/huggingface/inference-benchmarker pullPolicy: IfNotPresent From 541066317cc4ce1bd2c8abafd4f7e341b7f75064 Mon Sep 17 00:00:00 2001 From: Luca Berton Date: Thu, 10 Jul 2025 12:15:48 +0200 Subject: [PATCH 2/2] Add Persisting benchmark results section to documentation --- README.md | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a588a72..80a9648 100644 --- a/README.md +++ b/README.md @@ -210,11 +210,51 @@ values, sampling token counts from a normal distribution with the specified vari You can deploy the benchmarking tool on Kubernetes using the provided Helm chart. -Review the values (especially model, HF token and resources), and install the chart: +Review the `values.yaml` file (especially model, HF token and resources), and install the chart: ```shell $ helm install inference-benchmarker ./extra/k8s/inference-benchmarker ``` +## Persisting benchmark results + +The default chart mounts the **results** volume with `emptyDir`, so files vanish when the pod terminates. +Create a **PersistentVolumeClaim** named `results`, using a StorageClass that fits *your* cluster policy (e.g. `gp2`, `rook-ceph`, `shared-rwx`, …), then patch the chart to use it. + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: results +spec: + storageClassName: example # ← change to your StorageClass + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi +``` + +```bash +kubectl apply -f pvc.yaml +``` + +Edit `templates/benchmark.yaml` (or patch via values) so the volume becomes: + +```yaml +volumes: + - name: results + persistentVolumeClaim: + claimName: results +``` + +After each run the benchmark drops its **JSON** reports into the `results` PVC, where you can: + +* `kubectl cp` them locally +* mount the PVC in other pods for post-processing + +Deploy, benchmark, and iterate—without losing your data between runs! + + ## Deploy on Slurm Slurm example is provided in `extra/slurm`.