 # Many commonly defined values were left blank (default) so that this scenario is applicable to as many environments as possible.
 
 # Model parameters
+# export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
+# export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
+# export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
+# export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"
 export LLMDBENCH_DEPLOY_MODEL_LIST="deepseek-ai/DeepSeek-R1-0528"
 
-
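(Aside, not part of the diff: a quick way to sanity-check that the model id in LLMDBENCH_DEPLOY_MODEL_LIST resolves on Hugging Face before deploying; assumes the huggingface_hub Python package is installed locally and HF_TOKEN is set if the model is gated.)

    python3 -c "from huggingface_hub import model_info; print(model_info('deepseek-ai/DeepSeek-R1-0528').id)"
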
 # PVC parameters
 # Storage class (leave commented out to automatically detect the "default" storage class)
 # export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=standard-rwx
 # export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=shared-vast
 # export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=ocs-storagecluster-cephfs
 export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti
 
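(Aside, not part of the diff: to see which storage class the auto-detection would pick, list the classes and look for the default annotation.)

    kubectl get storageclass
    # the default class carries the annotation storageclass.kubernetes.io/is-default-class=true
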
-# gateway configuration
-# ##### default is istio and NodePort
-# export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_CLASS_NAME=kgateway
-# ##### on openshift as alternative to (default) NodePort
-# export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_SERVICE_TYPE=ClusterIP
-# ##### if support LoadBalancer
-# export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_SERVICE_TYPE=LoadBalancer
-
 # Routing configuration (via gaie)
 export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE="custom-plugins.yaml"
 export LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS=$(mktemp)
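(Aside, not part of the diff: the plugin definitions written into this mktemp-backed file are elided from the hunk; the scenario's recurring pattern, sketched below with placeholder content, is a heredoc redirected into the file the variable points at.)

    cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS
    # ... plugin definitions (elided in this diff) ...
    EOF
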
 
 # Routing configuration (via modelservice)
 # export LLMDBENCH_LLMD_ROUTINGSIDECAR_CONNECTOR=nixlv2 # already the default
-export LLMDBENCH_LLMD_ROUTINGSIDECAR_DEBUG_LEVEL=1
-export LLMDBENCH_LLMD_ROUTINGSIDECAR_IMAGE_TAG=v0.4.0
-
-export LLMDBENCH_LLMD_IMAGE_TAG=v0.4.0
 
 # Affinity to select a node with the appropriate accelerator (leave commented out to automatically detect the GPU; works for OpenShift, Kubernetes and GKE)
 # export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3 # OpenShift
-export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200 # Kubernetes
+# export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200 # Kubernetes
 # export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-tesla-a100 # GKE
 # export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-h100-80gb # GKE
 # export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-L40S # OpenShift
 # export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-A100-SXM4-80GB # OpenShift
 # export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu # ANY GPU (useful for Minikube)
 
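(Aside, not part of the diff: to find which accelerator label key/value pairs a cluster actually exposes before picking an affinity, inspect the node labels; assumes jq is installed.)

    kubectl get nodes -o json | jq '.items[].metadata.labels | with_entries(select(.key | test("gpu|accelerator")))'
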
-# Uncomment to request specific network devices
-# ####export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/roce_gdr
-# ######export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/ib
-# export LLMDBENCH_VLLM_COMMON_NETWORK_NR=4
-export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE=ephemeral-storage
-export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR=1Ti
-
 export LLMDBENCH_VLLM_COMMON_POD_SCHEDULER=custom-binpack-scheduler
 
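(Aside, not part of the diff: presumably this variable ends up as the pods' spec.schedulerName; a quick way to confirm which scheduler the deployed pods were actually bound to.)

    kubectl get pods -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.schedulerName}{"\n"}{end}'
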
 # Uncomment to use hostNetwork (only ONE POD PER NODE)
@@ -109,8 +93,6 @@ export LLMDBENCH_VLLM_COMMON_POD_SCHEDULER=custom-binpack-scheduler
 export LLMDBENCH_VLLM_MODELSERVICE_MULTINODE=true
 
 # Common parameters across standalone and llm-d (prefill and decode) pods
-# export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=16000
-# export LLMDBENCH_VLLM_COMMON_BLOCK_SIZE=64
 
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ENVVARS_TO_YAML=$(mktemp)
 cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ENVVARS_TO_YAML
@@ -171,15 +153,17 @@ export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_RESOURCE=nvidia
 # #####export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_RESOURCE=rdma/roce_gdr
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_RESOURCE=rdma/ib
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_NR=1
-export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EPHEMERAL_STORAGE_NR=1Ti
-export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_INFERENCE_PORT=8000
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_MEM_UTIL=0.75
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EPHEMERAL_STORAGE=1Ti
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND=custom
-# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS=$(mktemp)
 cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS
-find /dev/shm -type f -delete; START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )); exec vllm serve \
+REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_PREFILL_PREPROCESS; \
+exec vllm serve \
  REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
- --port 8000 \
+ --served-model-name REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
+ --port REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_PREFILL_INFERENCE_PORT \
  --trust-remote-code \
  --disable-uvicorn-access-log \
  --data-parallel-hybrid-lb \
@@ -201,7 +185,7 @@ find /dev/shm -type f -delete; START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE
  "step_interval":"3000",
  "num_redundant_experts":"32",
  "log_balancedness":"False"}' \
- --gpu-memory-utilization 0.75
+ --gpu-memory-utilization REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_MEM_UTIL
 EOF
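(Aside, not part of the diff: the REPLACE_ENV_<NAME> tokens are presumably substituted by the harness with the value of the corresponding environment variable before the args file is used; a minimal sketch of that expansion, assuming this mechanism, with expand_tokens as a hypothetical helper. Requires sed and envsubst.)

    # hypothetical stand-in for the expansion the harness presumably performs
    expand_tokens() {
      sed -E 's/REPLACE_ENV_([A-Z0-9_]+)/${\1}/g' "$1" | envsubst
    }
    expand_tokens "$LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS"
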
 
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_CONTAINER_CONFIG=$(mktemp)
@@ -213,28 +197,6 @@ securityContext:
     - SYS_RAWIO
   runAsGroup: 0
   runAsUser: 0
-# startupProbe:
-#   httpGet:
-#     path: /health
-#     port: 8000
-#   initialDelaySeconds: 0
-#   periodSeconds: 1
-#   timeoutSeconds: 5
-#   failureThreshold: 2700
-# livenessProbe:
-#   httpGet:
-#     path: /health
-#     port: 8000
-#   periodSeconds: 30
-#   timeoutSeconds: 5
-#   failureThreshold: 3
-# readinessProbe:
-#   httpGet:
-#     path: /v1/models
-#     port: 8000
-#   periodSeconds: 10
-#   timeoutSeconds: 5
-#   failureThreshold: 3
 imagePullPolicy: Always
 EOF
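(Aside, not part of the diff: SYS_RAWIO is presumably granted for the RDMA/GDR path requested above; to confirm the capability actually landed on a running prefill pod, inspect the main process's capability sets. The pod name is a placeholder.)

    kubectl exec <prefill-pod> -- grep Cap /proc/1/status
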
 
@@ -244,18 +206,24 @@ export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_VOLUME_MOUNTS=$(mktemp)
 cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_VOLUME_MOUNTS}
 - name: dshm
   mountPath: /dev/shm
-- mountPath: /var/cache/huggingface
-  name: hf-cache
-- mountPath: /var/cache/vllm
-  name: jit-cache
+- name: preprocesses
+  mountPath: /setup/preprocess
+- name: hf-cache
+  mountPath: /var/cache/huggingface
+- name: jit-cache
+  mountPath: /var/cache/vllm
 EOF
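(Aside, not part of the diff: the /setup/preprocess mount is backed by the llm-d-benchmark-preprocesses ConfigMap declared in the volumes below; if the harness does not create it for you, it could be created along these lines. Namespace and file path are placeholders.)

    kubectl -n <namespace> create configmap llm-d-benchmark-preprocesses \
      --from-file=set_llmdbench_environment.py=<path-to>/set_llmdbench_environment.py
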
 
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_VOLUMES=$(mktemp)
 cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_VOLUMES}
 - name: dshm
   emptyDir:
     medium: Memory
-    sizeLimit: 2Gi # roughly 32MB per local DP plus scratch space
+    sizeLimit: REPLACE_ENV_LLMDBENCH_VLLM_COMMON_SHM_MEM # roughly 32MB per local DP plus scratch space
+- name: preprocesses
+  configMap:
+    defaultMode: 320 # decimal for octal 0500 (owner read+execute)
+    name: llm-d-benchmark-preprocesses
 - hostPath:
     path: /mnt/local/hf-cache
     type: DirectoryOrCreate
@@ -281,17 +249,16 @@ export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_RESOURCE=nvidia
 # #####export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/ib
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=1
-export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EPHEMERAL_STORAGE_NR=1Ti
-export LLMDBENCH_VLLM_MODELSERVICE_DECODE_INFERENCE_PORT=8200
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EPHEMERAL_STORAGE=1Ti
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=custom
-# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS=$(mktemp)
-# Clear /dev/shm on start to prevent running out of space when crashes occur
-# https://github.com/llm-d/llm-d/issues/352
 cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS
-find /dev/shm -type f -delete; START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )); exec vllm serve \
+REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS; \
+exec vllm serve \
  REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
- --port 8200 \
+ --served-model-name REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
+ --port REPLACE_ENV_LLMDBENCH_VLLM_COMMON_METRICS_PORT \
  --trust-remote-code \
  --disable-uvicorn-access-log \
  --data-parallel-hybrid-lb \
@@ -372,47 +339,31 @@ securityContext:
     - SYS_RAWIO
   runAsGroup: 0
   runAsUser: 0
-# startupProbe:
-#   httpGet:
-#     path: /health
-#     port: 8200
-#   initialDelaySeconds: 0
-#   periodSeconds: 1
-#   timeoutSeconds: 5
-#   failureThreshold: 2700
-# livenessProbe:
-#   httpGet:
-#     path: /health
-#     port: 8200
-#   periodSeconds: 30
-#   timeoutSeconds: 5
-#   failureThreshold: 3
-# readinessProbe:
-#   httpGet:
-#     path: /v1/models
-#     port: 8200
-#   periodSeconds: 10
-#   timeoutSeconds: 5
-#   failureThreshold: 3
 imagePullPolicy: Always
 EOF
 
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUME_MOUNTS=$(mktemp)
 cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUME_MOUNTS}
 - name: dshm
   mountPath: /dev/shm
-- mountPath: /var/cache/huggingface
-  name: hf-cache
-- mountPath: /var/cache/vllm
-  name: jit-cache
+- name: preprocesses
+  mountPath: /setup/preprocess
+- name: hf-cache
+  mountPath: /var/cache/huggingface
+- name: jit-cache
+  mountPath: /var/cache/vllm
 EOF
 
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUMES=$(mktemp)
 cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUMES}
 - name: dshm
   emptyDir:
     medium: Memory
-    sizeLimit: 2Gi # roughly 32MB per local DP plus scratch space
+    sizeLimit: REPLACE_ENV_LLMDBENCH_VLLM_COMMON_SHM_MEM # roughly 32MB per local DP plus scratch space
+- name: preprocesses
+  configMap:
+    defaultMode: 320
+    name: llm-d-benchmark-preprocesses
 - hostPath:
     path: /mnt/local/hf-cache
     type: DirectoryOrCreate