From 364ca7dc9bf21d13f35948ee1d38120932c7f55a Mon Sep 17 00:00:00 2001
From: Luca Berton <luca@lucaberton.it>
Date: Wed, 9 Jul 2025 14:57:00 +0200
Subject: [PATCH 1/2] Deploy microsoft/Phi-3-mini-4k-instruct model and fix
 HOME and benchmark launch parameter

---
 extra/k8s/inference-benchmarker/templates/vllm.yaml | 2 ++
 extra/k8s/inference-benchmarker/values.yaml         | 9 ++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/extra/k8s/inference-benchmarker/templates/vllm.yaml b/extra/k8s/inference-benchmarker/templates/vllm.yaml
index fb7178d..edf6730 100644
--- a/extra/k8s/inference-benchmarker/templates/vllm.yaml
+++ b/extra/k8s/inference-benchmarker/templates/vllm.yaml
@@ -43,6 +43,8 @@ spec:
                 secretKeyRef:
                   name: {{ include "inference-benchmarker.fullname" . }}-hf-token
                   key: HF_TOKEN
+            - name: HOME
+              value: /tmp
           args:
             - "--model"
             - "{{ .Values.model_id }}"
diff --git a/extra/k8s/inference-benchmarker/values.yaml b/extra/k8s/inference-benchmarker/values.yaml
index 6fd4b20..dcdf73b 100644
--- a/extra/k8s/inference-benchmarker/values.yaml
+++ b/extra/k8s/inference-benchmarker/values.yaml
@@ -3,11 +3,11 @@ nameOverride: ""
 fullnameOverride: ""
 
 hf_token: ""
-model_id: "meta-llama/Llama-3.1-8B-Instruct"
-server: tgi
+model_id: "microsoft/Phi-3-mini-4k-instruct"
+server: vllm
 
 tgi:
-  enabled: true
+  enabled: false
   extra_args:
     - "--max-concurrent-requests"
     - "512"
@@ -28,7 +28,7 @@ tgi:
   affinity: { }
 
 vllm:
-  enabled: false
+  enabled: true
   extra_args:
   image:
     repository: vllm/vllm-openai
@@ -50,7 +50,6 @@ benchmark:
   extra_args:
     - "--profile"
     - "chat"
-    - "800"
   image:
     repository: ghcr.io/huggingface/inference-benchmarker
     pullPolicy: IfNotPresent

From 541066317cc4ce1bd2c8abafd4f7e341b7f75064 Mon Sep 17 00:00:00 2001
From: Luca Berton <luca@lucaberton.it>
Date: Thu, 10 Jul 2025 12:15:48 +0200
Subject: [PATCH 2/2] Add Persisting benchmark results section to documentation

---
 README.md | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a588a72..80a9648 100644
--- a/README.md
+++ b/README.md
@@ -210,11 +210,51 @@ values, sampling token counts from a normal distribution with the specified vari
 
 You can deploy the benchmarking tool on Kubernetes using the provided Helm chart.
 
-Review the values (especially model, HF token and resources), and install the chart:
+Review the `values.yaml` file (especially model, HF token and resources), and install the chart:
 ```shell
 $ helm install inference-benchmarker ./extra/k8s/inference-benchmarker
 ```
 
+### Persisting benchmark results
+
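+By default the chart mounts the **results** volume as an `emptyDir`, so the result files are lost when the pod terminates.
+To keep them, save the manifest below as `pvc.yaml` to create a **PersistentVolumeClaim** named `results`, using a StorageClass that fits *your* cluster policy (e.g. `gp2`, `rook-ceph`, `shared-rwx`, …) and an access mode that class supports (block storage such as `gp2` only offers `ReadWriteOnce`, not `ReadWriteMany`), then patch the chart to use it.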
+
+```yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: results
+spec:
+  storageClassName: example      # ← change to your StorageClass
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 5Gi
+```
+
+```bash
+kubectl apply -f pvc.yaml
+```
+
+Edit `extra/k8s/inference-benchmarker/templates/benchmark.yaml` (or patch via values) so that the `results` volume is defined as:
+
+```yaml
+volumes:
+  - name: results
+    persistentVolumeClaim:
+      claimName: results
+```
+
+After each run the benchmark drops its **JSON** reports into the `results` PVC, where you can:
+
+* copy them to your machine with `kubectl cp` from any running pod that mounts the PVC
+* mount the PVC in other pods for post-processing
+
+Deploy, benchmark, and iterate—without losing your data between runs!
+
+
 ## Deploy on Slurm
 
 Slurm example is provided in `extra/slurm`.