Skip to content

Commit 811a173

Browse files
committed
fix: docs and versions
1 parent 43f8291 commit 811a173

File tree

4 files changed

+11
-22
lines changed

4 files changed

+11
-22
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ MODEL=meta-llama/Llama-3.1-8B-Instruct
6060
HF_TOKEN=<your HF READ token>
6161

6262
docker run --gpus all --shm-size 1g -p 8080:80 -e "HF_TOKEN=$HF_TOKEN" \
63-
ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $MODEL
63+
ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $MODEL
6464
```
6565

6666
**vLLM**

extra/k8s/inference-benchmarker/values.yaml

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
2-
31
imagePullSecrets: [ ]
42
nameOverride: ""
53
fullnameOverride: ""
@@ -50,14 +48,9 @@ vllm:
5048

5149
benchmark:
5250
extra_args:
53-
- "--max-vus"
51+
- "--profile"
52+
- "chat"
5453
- "800"
55-
- "--duration"
56-
- "120s"
57-
- "--warmup"
58-
- "30s"
59-
- "--benchmark-kind"
60-
- "sweep"
6154
image:
6255
repository: ghcr.io/huggingface/inference-benchmarker
6356
pullPolicy: IfNotPresent

extra/slurm/tgi.slurm

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,7 @@ srun --het-group=0 \
4141
--no-container-mount-home \
4242
/usr/local/bin/text-generation-launcher \
4343
--model-id $MODEL \
44-
--max-concurrent-requests 1024 \
45-
--max-waiting-tokens 0 \
46-
--max-batch-prefill-tokens 512&
44+
--max-concurrent-requests 1024&
4745

4846
# wait until /health is available, die after 10 minutes
4947
timeout 600 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for TGI to start...\"; done" || exit 1
@@ -64,16 +62,15 @@ if [[ $exit_code != 124 ]]; then
6462
--no-container-mount-home \
6563
inference-benchmarker \
6664
--tokenizer-name "$MODEL" \
67-
--max-vus 800 \
65+
--max-vus 128 \
6866
--url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \
6967
--duration 120s \
7068
--warmup 30s \
7169
--benchmark-kind rate \
7270
--rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \
73-
--extra-meta "version=$VERSION,engine=TGI,tp=$TP,max_batch_prefill_tokens=512" \
74-
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
75-
--decode-options "num_tokens=800,max_tokens=800,min_tokens=800,variance=0" \
76-
--dataset-file share_gpt_cleaned.json \
71+
--extra-meta "version=$VERSION,engine=TGI,tp=$TP" \
72+
--decode-options "num_tokens=800,max_tokens=800,min_tokens=50,variance=100" \
73+
--dataset-file share_gpt_turns.json \
7774
--no-console
7875
fi
7976

extra/slurm/vllm.slurm

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,16 +64,15 @@ if [[ $exit_code != 124 ]]; then
6464
--no-container-mount-home \
6565
inference-benchmarker \
6666
--tokenizer-name "$MODEL" \
67-
--max-vus 800 \
67+
--max-vus 128 \
6868
--url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \
6969
--duration 120s \
7070
--warmup 30s \
7171
--benchmark-kind rate \
7272
--rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \
7373
--extra-meta "version=$VERSION,engine=vLLM,tp=$TP,max_num_batched_tokens=512" \
74-
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
75-
--decode-options "num_tokens=800,max_tokens=800,min_tokens=800,variance=0" \
76-
--dataset-file share_gpt_cleaned.json \
74+
--decode-options "num_tokens=800,max_tokens=800,min_tokens=50,variance=100" \
75+
--dataset-file share_gpt_turns.json \
7776
--no-console
7877
fi
7978

0 commit comments

Comments
 (0)