42 changes: 41 additions & 1 deletion README.md
@@ -210,11 +210,51 @@ values, sampling token counts from a normal distribution with the specified variance

You can deploy the benchmarking tool on Kubernetes using the provided Helm chart.

-Review the values (especially model, HF token and resources), and install the chart:
+Review the `values.yaml` file (especially model, HF token and resources), and install the chart:
```shell
$ helm install inference-benchmarker ./extra/k8s/inference-benchmarker
```
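Chart values can also be overridden inline at install time instead of editing `values.yaml`. A minimal sketch (the `model_id` and `hf_token` keys are the ones defined in this chart's `values.yaml`; the model shown is only an example):

```shell
# Install or upgrade the release, overriding chart values inline.
# model_id and hf_token are keys from this chart's values.yaml.
helm upgrade --install inference-benchmarker ./extra/k8s/inference-benchmarker \
  --set model_id="microsoft/Phi-3-mini-4k-instruct" \
  --set hf_token="$HF_TOKEN"
```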

## Persisting benchmark results

The default chart mounts the **results** volume with `emptyDir`, so files vanish when the pod terminates.
Create a **PersistentVolumeClaim** named `results`, using a StorageClass that fits *your* cluster policy (e.g. `gp2`, `rook-ceph`, `shared-rwx`, …), then patch the chart to use it.

```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: results
spec:
  storageClassName: example # ← change to your StorageClass
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
```

```bash
kubectl apply -f pvc.yaml
```

Edit `templates/benchmark.yaml` (or patch via values) so the volume becomes:

```yaml
volumes:
  - name: results
    persistentVolumeClaim:
      claimName: results
```
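If you prefer not to edit the template, the same change can be applied to a live release with `kubectl patch`. This is only a sketch: it assumes the benchmark is rendered as a Deployment named `inference-benchmarker` and that `results` is the first volume in the pod spec; adjust the kind, name, and index to your manifests.

```shell
# Replace the first volume in the pod template (assumed to be the
# emptyDir "results" volume) with the PVC created above.
# Deployment name and volume index are assumptions; verify with
#   kubectl get deploy && kubectl get deploy inference-benchmarker -o yaml
kubectl patch deployment inference-benchmarker --type=json -p '[
  {"op": "replace",
   "path": "/spec/template/spec/volumes/0",
   "value": {"name": "results",
             "persistentVolumeClaim": {"claimName": "results"}}}
]'
```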

After each run, the benchmark writes its **JSON** reports to the `results` PVC, where you can:

* `kubectl cp` them locally
* mount the PVC in other pods for post-processing
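For example, copying the reports out of a running pod might look like this (the pod name and the `/results` mount path are assumptions; substitute the actual pod name and the path where the PVC is mounted in your release):

```shell
# Copy all JSON reports from the pod's results mount to the local machine.
# "inference-benchmarker-0" and "/results" are assumed names -- check with
#   kubectl get pods
kubectl cp inference-benchmarker-0:/results ./benchmark-results
ls benchmark-results/*.json
```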

Deploy, benchmark, and iterate without losing your data between runs!


## Deploy on Slurm

Slurm example is provided in `extra/slurm`.
2 changes: 2 additions & 0 deletions extra/k8s/inference-benchmarker/templates/vllm.yaml
@@ -43,6 +43,8 @@ spec:
secretKeyRef:
name: {{ include "inference-benchmarker.fullname" . }}-hf-token
key: HF_TOKEN
+- name: HOME
+  value: /tmp
args:
- "--model"
- "{{ .Values.model_id }}"
9 changes: 4 additions & 5 deletions extra/k8s/inference-benchmarker/values.yaml
@@ -3,11 +3,11 @@ nameOverride: ""
fullnameOverride: ""

hf_token: ""
-model_id: "meta-llama/Llama-3.1-8B-Instruct"
-server: tgi
+model_id: "microsoft/Phi-3-mini-4k-instruct"
+server: vllm

tgi:
-  enabled: true
+  enabled: false
extra_args:
- "--max-concurrent-requests"
- "512"
@@ -28,7 +28,7 @@ tgi:
affinity: { }

vllm:
-  enabled: false
+  enabled: true
extra_args:
image:
repository: vllm/vllm-openai
@@ -50,7 +50,6 @@ benchmark:
extra_args:
- "--profile"
- "chat"
-  - "800"
image:
repository: ghcr.io/huggingface/inference-benchmarker
pullPolicy: IfNotPresent