fix: docs and versions

Hugoch · Hugoch · commit 811a17336f0f · 2025-03-28T09:19:11.000+01:00
diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ MODEL=meta-llama/Llama-3.1-8B-Instruct
 HF_TOKEN=<your HF READ token>
 
 docker run --gpus all --shm-size 1g -p 8080:80 -e "HF_TOKEN=$HF_TOKEN" \
-    ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $MODEL
+    ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $MODEL
 ```
 
 **vLLM**
diff --git a/extra/k8s/inference-benchmarker/values.yaml b/extra/k8s/inference-benchmarker/values.yaml
@@ -1,5 +1,3 @@
-
-
 imagePullSecrets: [ ]
 nameOverride: ""
 fullnameOverride: ""
@@ -50,14 +48,8 @@ vllm:
 
 benchmark:
   extra_args:
-    - "--max-vus"
-    - "800"
-    - "--duration"
-    - "120s"
-    - "--warmup"
-    - "30s"
-    - "--benchmark-kind"
-    - "sweep"
+    - "--profile"
+    - "chat"
   image:
     repository: ghcr.io/huggingface/inference-benchmarker
     pullPolicy: IfNotPresent
diff --git a/extra/slurm/tgi.slurm b/extra/slurm/tgi.slurm
@@ -41,9 +41,7 @@ srun --het-group=0 \
      --no-container-mount-home \
      /usr/local/bin/text-generation-launcher \
       --model-id $MODEL \
-      --max-concurrent-requests 1024 \
-      --max-waiting-tokens 0 \
-      --max-batch-prefill-tokens 512&
+      --max-concurrent-requests 1024&
 
 # wait until /health is available, die after 10 minutes
 timeout 600 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for TGI to start...\"; done" || exit 1
@@ -64,16 +62,15 @@ if [[ $exit_code != 124 ]]; then
          --no-container-mount-home \
          inference-benchmarker \
              --tokenizer-name "$MODEL" \
-             --max-vus 800 \
+             --max-vus 128 \
              --url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \
              --duration 120s \
              --warmup 30s \
              --benchmark-kind rate \
              --rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \
-             --extra-meta "version=$VERSION,engine=TGI,tp=$TP,max_batch_prefill_tokens=512" \
-             --prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
-             --decode-options "num_tokens=800,max_tokens=800,min_tokens=800,variance=0" \
-             --dataset-file share_gpt_cleaned.json \
+             --extra-meta "version=$VERSION,engine=TGI,tp=$TP" \
+             --decode-options "num_tokens=800,max_tokens=800,min_tokens=50,variance=100" \
+             --dataset-file share_gpt_turns.json \
              --no-console
 fi
 
diff --git a/extra/slurm/vllm.slurm b/extra/slurm/vllm.slurm
@@ -64,16 +64,15 @@ if [[ $exit_code != 124 ]]; then
          --no-container-mount-home \
          inference-benchmarker \
              --tokenizer-name "$MODEL" \
-             --max-vus 800 \
+             --max-vus 128 \
              --url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \
              --duration 120s \
              --warmup 30s \
              --benchmark-kind rate \
              --rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \
              --extra-meta "version=$VERSION,engine=vLLM,tp=$TP,max_num_batched_tokens=512" \
-             --prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
-             --decode-options "num_tokens=800,max_tokens=800,min_tokens=800,variance=0" \
-             --dataset-file share_gpt_cleaned.json \
+             --decode-options "num_tokens=800,max_tokens=800,min_tokens=50,variance=100" \
+             --dataset-file share_gpt_turns.json \
              --no-console
 fi