Skip to content

Commit 9fdb095

Browse files
Final touches for release v0.4 (#578)
Signed-off-by: maugustosilva <maugusto.silva@gmail.com>
1 parent 0ceb2aa commit 9fdb095

17 files changed

+72
-118
lines changed

build/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ WORKDIR /workspace
3535

3636
ARG INFERENCE_PERF_REPO=https://github.com/kubernetes-sigs/inference-perf.git
3737
ARG INFERENCE_PERF_BRANCH=main
38-
ARG INFERENCE_PERF_COMMIT=e8e0aa99c57f2ffa0912df7ba1fbd2a8a596a041
38+
ARG INFERENCE_PERF_COMMIT=a85b31b5de9fde12b5a0ebaaabb2aee1ccb76657
3939
RUN git clone --branch ${INFERENCE_PERF_BRANCH} ${INFERENCE_PERF_REPO}
4040
RUN cd inference-perf; \
4141
git checkout ${INFERENCE_PERF_COMMIT}; \
@@ -51,7 +51,7 @@ RUN cd vllm; \
5151

5252
ARG GUIDELLM_REPO=https://github.com/vllm-project/guidellm.git
5353
ARG GUIDELLM_BRANCH=main
54-
ARG GUIDELLM_COMMIT=f6175cdd8a88f0931bd46822ed7a71787dcd7cee
54+
ARG GUIDELLM_COMMIT=adfa108ab1df6f2a1452d1037a71817a493303a8
5555
RUN git clone --branch ${GUIDELLM_BRANCH} ${GUIDELLM_REPO}
5656
RUN cd guidellm; \
5757
pip install torch --index-url https://download.pytorch.org/whl/cpu; \

scenarios/examples/spyre.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_RESOURCE=ibm.com/spyre_pf
117117
# export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR=0
118118

119119
# Decode parameters: 2 decode pods
120-
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
120+
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=2
121121
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=16
122122
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=64Gi
123123
# Uncomment (###) the following line to enable multi-nic

scenarios/guides/inference-scheduling.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
1414
export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
1515
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"
16+
#export LLMDBENCH_DEPLOY_MODEL_LIST="deepseek-ai/DeepSeek-R1-0528"
1617

1718
# PVC parameters
1819
# Storage class (leave commented out to automatically detect the "default" storage class)

scenarios/guides/pd-disaggregation.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
1414
export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
1515
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"
16+
#export LLMDBENCH_DEPLOY_MODEL_LIST="deepseek-ai/DeepSeek-R1-0528"
1617

1718
# PVC parameters
1819
# Storage class (leave commented out to automatically detect the "default" storage class)

scenarios/guides/precise-prefix-cache-aware.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
1414
export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
1515
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"
16+
#export LLMDBENCH_DEPLOY_MODEL_LIST="deepseek-ai/DeepSeek-R1-0528"
1617

1718
# PVC parameters
1819
# Storage class (leave commented out to automatically detect the "default" storage class)

scenarios/guides/simulated-accelerators.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
77
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
88
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"
9+
#export LLMDBENCH_DEPLOY_MODEL_LIST="deepseek-ai/DeepSeek-R1-0528"
910

1011
export LLMDBENCH_VLLM_COMMON_REPLICAS=1
1112

scenarios/guides/tiered-prefix-cache.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
1414
export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
1515
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"
16+
#export LLMDBENCH_DEPLOY_MODEL_LIST="deepseek-ai/DeepSeek-R1-0528"
1617

1718
# PVC parameters
1819
# Storage class (leave commented out to automatically detect the "default" storage class)

scenarios/guides/wide-ep-lws.sh

Lines changed: 41 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,19 @@
99
# Many commonly defined values were left blank (default) so that this scenario is applicable to as many environments as possible.
1010

1111
# Model parameters
12+
#export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
13+
#export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
14+
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
15+
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"
1216
export LLMDBENCH_DEPLOY_MODEL_LIST="deepseek-ai/DeepSeek-R1-0528"
1317

14-
1518
# PVC parameters
1619
# Storage class (leave commented out to automatically detect the "default" storage class)
1720
#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=standard-rwx
1821
#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=shared-vast
1922
#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=ocs-storagecluster-cephfs
2023
export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti
2124

22-
# gateway configuration
23-
###### default is istio and NodePort
24-
# export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_CLASS_NAME=kgateway
25-
###### on openshift as alternative to (default) NodePort
26-
# export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_SERVICE_TYPE=ClusterIP
27-
###### if support LoadBalancer
28-
# export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_SERVICE_TYPE=LoadBalancer
29-
3025
# Routing configuration (via gaie)
3126
export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE="custom-plugins.yaml"
3227
export LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS=$(mktemp)
@@ -74,27 +69,16 @@ EOF
7469

7570
# Routing configuration (via modelservice)
7671
# export LLMDBENCH_LLMD_ROUTINGSIDECAR_CONNECTOR=nixlv2 # already the default
77-
export LLMDBENCH_LLMD_ROUTINGSIDECAR_DEBUG_LEVEL=1
78-
export LLMDBENCH_LLMD_ROUTINGSIDECAR_IMAGE_TAG=v0.4.0
79-
80-
export LLMDBENCH_LLMD_IMAGE_TAG=v0.4.0
8172

8273
# Affinity to select node with appropriate accelerator (leave all options commented out to automatically detect the GPU; auto-detection works on OpenShift, Kubernetes and GKE)
8374
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3 # OpenShift
84-
export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200 # Kubernetes
75+
#export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200 # Kubernetes
8576
#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-tesla-a100 # GKE
8677
#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-h100-80gb # GKE
8778
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-L40S # OpenShift
8879
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-A100-SXM4-80GB # OpenShift
8980
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu # ANY GPU (useful for Minikube)
9081

91-
# Uncomment to request specific network devices
92-
#####export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/roce_gdr
93-
#######export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/ib
94-
#export LLMDBENCH_VLLM_COMMON_NETWORK_NR=4
95-
export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE=ephemeral-storage
96-
export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR=1Ti
97-
9882
export LLMDBENCH_VLLM_COMMON_POD_SCHEDULER=custom-binpack-scheduler
9983

10084
# Uncomment to use hostNetwork (only ONE POD PER NODE)
@@ -109,8 +93,6 @@ export LLMDBENCH_VLLM_COMMON_POD_SCHEDULER=custom-binpack-scheduler
10993
export LLMDBENCH_VLLM_MODELSERVICE_MULTINODE=true
11094

11195
# Common parameters across standalone and llm-d (prefill and decode) pods
112-
#export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=16000
113-
#export LLMDBENCH_VLLM_COMMON_BLOCK_SIZE=64
11496

11597
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ENVVARS_TO_YAML=$(mktemp)
11698
cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ENVVARS_TO_YAML
@@ -171,15 +153,17 @@ export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_RESOURCE=nvidia
171153
######export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_RESOURCE=rdma/roce_gdr
172154
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_RESOURCE=rdma/ib
173155
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_NR=1
174-
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EPHEMERAL_STORAGE_NR=1Ti
175-
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_INFERENCE_PORT=8000
156+
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_MEM_UTIL=0.75
157+
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EPHEMERAL_STORAGE=1Ti
176158
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND=custom
177-
# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
159+
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
178160
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS=$(mktemp)
179161
cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS
180-
find /dev/shm -type f -delete; START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )); exec vllm serve \
162+
REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_PREFILL_PREPROCESS; \
163+
exec vllm serve \
181164
REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
182-
--port 8000 \
165+
--served-model-name REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
166+
--port REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_PREFILL_INFERENCE_PORT \
183167
--trust-remote-code \
184168
--disable-uvicorn-access-log \
185169
--data-parallel-hybrid-lb \
@@ -201,7 +185,7 @@ find /dev/shm -type f -delete; START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE
201185
"step_interval":"3000",
202186
"num_redundant_experts":"32",
203187
"log_balancedness":"False"}' \
204-
--gpu-memory-utilization 0.75
188+
--gpu-memory-utilization REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_MEM_UTIL
205189
EOF
206190

207191
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_CONTAINER_CONFIG=$(mktemp)
@@ -213,28 +197,6 @@ securityContext:
213197
- SYS_RAWIO
214198
runAsGroup: 0
215199
runAsUser: 0
216-
# startupProbe:
217-
# httpGet:
218-
# path: /health
219-
# port: 8000
220-
# initialDelaySeconds: 0
221-
# periodSeconds: 1
222-
# timeoutSeconds: 5
223-
# failureThreshold: 2700
224-
# livenessProbe:
225-
# httpGet:
226-
# path: /health
227-
# port: 8000
228-
# periodSeconds: 30
229-
# timeoutSeconds: 5
230-
# failureThreshold: 3
231-
# readinessProbe:
232-
# httpGet:
233-
# path: /v1/models
234-
# port: 8000
235-
# periodSeconds: 10
236-
# timeoutSeconds: 5
237-
# failureThreshold: 3
238200
imagePullPolicy: Always
239201
EOF
240202

@@ -244,18 +206,24 @@ export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_VOLUME_MOUNTS=$(mktemp)
244206
cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_VOLUME_MOUNTS}
245207
- name: dshm
246208
mountPath: /dev/shm
247-
- mountPath: /var/cache/huggingface
248-
name: hf-cache
249-
- mountPath: /var/cache/vllm
250-
name: jit-cache
209+
- name: preprocesses
210+
mountPath: /setup/preprocess
211+
- name: hf-cache
212+
mountPath: /var/cache/huggingface
213+
- name: jit-cache
214+
mountPath: /var/cache/vllm
251215
EOF
252216

253217
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_VOLUMES=$(mktemp)
254218
cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_VOLUMES}
255219
- name: dshm
256220
emptyDir:
257221
medium: Memory
258-
sizeLimit: 2Gi # roughly 32MB per local DP plus scratch space
222+
sizeLimit: REPLACE_ENV_LLMDBENCH_VLLM_COMMON_SHM_MEM # roughly 32MB per local DP plus scratch space
223+
- name: preprocesses
224+
configMap:
225+
defaultMode: 320
226+
name: llm-d-benchmark-preprocesses
259227
- hostPath:
260228
path: /mnt/local/hf-cache
261229
type: DirectoryOrCreate
@@ -281,17 +249,16 @@ export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_RESOURCE=nvidia
281249
######export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
282250
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/ib
283251
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=1
284-
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EPHEMERAL_STORAGE_NR=1Ti
285-
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_INFERENCE_PORT=8200
252+
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EPHEMERAL_STORAGE=1Ti
286253
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=custom
287-
# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
254+
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
288255
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS=$(mktemp)
289-
# Clear /dev/shm on start to prevent running out of space when crashes occur
290-
# https://github.com/llm-d/llm-d/issues/352
291256
cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS
292-
find /dev/shm -type f -delete; START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )); exec vllm serve \
257+
REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS; \
258+
exec vllm serve \
293259
REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
294-
--port 8200 \
260+
--served-model-name REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
261+
--port REPLACE_ENV_LLMDBENCH_VLLM_COMMON_METRICS_PORT \
295262
--trust-remote-code \
296263
--disable-uvicorn-access-log \
297264
--data-parallel-hybrid-lb \
@@ -372,47 +339,31 @@ securityContext:
372339
- SYS_RAWIO
373340
runAsGroup: 0
374341
runAsUser: 0
375-
# startupProbe:
376-
# httpGet:
377-
# path: /health
378-
# port: 8200
379-
# initialDelaySeconds: 0
380-
# periodSeconds: 1
381-
# timeoutSeconds: 5
382-
# failureThreshold: 2700
383-
# livenessProbe:
384-
# httpGet:
385-
# path: /health
386-
# port: 8200
387-
# periodSeconds: 30
388-
# timeoutSeconds: 5
389-
# failureThreshold: 3
390-
# readinessProbe:
391-
# httpGet:
392-
# path: /v1/models
393-
# port: 8200
394-
# periodSeconds: 10
395-
# timeoutSeconds: 5
396-
# failureThreshold: 3
397342
imagePullPolicy: Always
398343
EOF
399344

400345
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUME_MOUNTS=$(mktemp)
401346
cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUME_MOUNTS}
402347
- name: dshm
403348
mountPath: /dev/shm
404-
- mountPath: /var/cache/huggingface
405-
name: hf-cache
406-
- mountPath: /var/cache/vllm
407-
name: jit-cache
349+
- name: preprocesses
350+
mountPath: /setup/preprocess
351+
- name: hf-cache
352+
mountPath: /var/cache/huggingface
353+
- name: jit-cache
354+
mountPath: /var/cache/vllm
408355
EOF
409356

410357
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUMES=$(mktemp)
411358
cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUMES}
412359
- name: dshm
413360
emptyDir:
414361
medium: Memory
415-
sizeLimit: 2Gi # roughly 32MB per local DP plus scratch space
362+
sizeLimit: REPLACE_ENV_LLMDBENCH_VLLM_COMMON_SHM_MEM # roughly 32MB per local DP plus scratch space
363+
- name: preprocesses
364+
configMap:
365+
defaultMode: 320
366+
name: llm-d-benchmark-preprocesses
416367
- hostPath:
417368
path: /mnt/local/hf-cache
418369
type: DirectoryOrCreate

setup/env.sh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,8 @@ export LLMDBENCH_VLLM_COMMON_ACCELERATOR_MEMORY="${LLMDBENCH_VLLM_COMMON_ACCELER
100100
export LLMDBENCH_VLLM_COMMON_NAMESPACE="${LLMDBENCH_VLLM_COMMON_NAMESPACE:-llmdbench}"
101101
export LLMDBENCH_VLLM_COMMON_SERVICE_ACCOUNT="${LLMDBENCH_VLLM_COMMON_SERVICE_ACCOUNT:-default}"
102102
export LLMDBENCH_VLLM_COMMON_PULL_SECRET=${LLMDBENCH_VLLM_COMMON_PULL_SECRET:-}
103-
export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE=${LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE:-}
104-
export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR=${LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR:-}
103+
export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE=${LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE:-ephemeral-storage}
104+
export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE=${LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE:-}
105105
export LLMDBENCH_VLLM_COMMON_ACCELERATOR_RESOURCE=${LLMDBENCH_VLLM_COMMON_ACCELERATOR_RESOURCE:-auto}
106106
export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=${LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE:-}
107107
export LLMDBENCH_VLLM_COMMON_NETWORK_NR=${LLMDBENCH_VLLM_COMMON_NETWORK_NR:-}
@@ -163,7 +163,7 @@ export LLMDBENCH_VLLM_STANDALONE_EXTRA_VOLUME_MOUNTS=${LLMDBENCH_VLLM_STANDALONE
163163
export LLMDBENCH_VLLM_STANDALONE_EXTRA_VOLUMES=${LLMDBENCH_VLLM_STANDALONE_EXTRA_VOLUMES:-$LLMDBENCH_VLLM_COMMON_EXTRA_VOLUMES}
164164
export LLMDBENCH_VLLM_STANDALONE_ENVVARS_TO_YAML=${LLMDBENCH_VLLM_STANDALONE_ENVVARS_TO_YAML:-$LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML}
165165
export LLMDBENCH_VLLM_STANDALONE_ARGS=${LLMDBENCH_VLLM_STANDALONE_ARGS:-"REPLACE_ENV_LLMDBENCH_VLLM_STANDALONE_PREPROCESS____;____vllm____serve____REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL____--no-enable-prefix-caching____--load-format____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_VLLM_LOAD_FORMAT____--port____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_INFERENCE_PORT____--max-model-len____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN____--disable-log-requests____--gpu-memory-utilization____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_ACCELERATOR_MEM_UTIL____--tensor-parallel-size____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM____--model-loader-extra-config____\"\$LLMDBENCH_VLLM_STANDALONE_MODEL_LOADER_EXTRA_CONFIG\""}
166-
export LLMDBENCH_VLLM_STANDALONE_EPHEMERAL_STORAGE=${LLMDBENCH_VLLM_STANDALONE_EPHEMERAL_STORAGE:-"20Gi"}
166+
export LLMDBENCH_VLLM_STANDALONE_EPHEMERAL_STORAGE=${LLMDBENCH_VLLM_STANDALONE_EPHEMERAL_STORAGE:-${LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE}}
167167

168168
# Modelservice (helm chart) specific parameters
169169
export LLMDBENCH_VLLM_INFRA_CHART_NAME=${LLMDBENCH_VLLM_INFRA_CHART_NAME:-"llm-d-infra"}
@@ -358,7 +358,7 @@ export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=${LLMDBENCH_VLLM_MODELSERVI
358358
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR:-$LLMDBENCH_VLLM_COMMON_CPU_NR}
359359
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM:-$LLMDBENCH_VLLM_COMMON_CPU_MEM}
360360
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_SHM_MEM=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_SHM_MEM:-$LLMDBENCH_VLLM_COMMON_SHM_MEM}
361-
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EPHEMERAL_STORAGE_NR=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EPHEMERAL_STORAGE_NR:-$LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR}
361+
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EPHEMERAL_STORAGE=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EPHEMERAL_STORAGE:-$LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE}
362362
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS:-true}
363363
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND:-vllmServe}
364364
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS:-"[--disable-log-requests____--max-model-len____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN____--tensor-parallel-size____REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM]"}
@@ -383,7 +383,7 @@ export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_NR=${LLMDBENCH_VLLM_MODELSERV
383383
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_CPU_NR=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_CPU_NR:-$LLMDBENCH_VLLM_COMMON_CPU_NR}
384384
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_CPU_MEM=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_CPU_MEM:-$LLMDBENCH_VLLM_COMMON_CPU_MEM}
385385
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_SHM_MEM=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_SHM_MEM:-$LLMDBENCH_VLLM_COMMON_SHM_MEM}
386-
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EPHEMERAL_STORAGE_NR=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EPHEMERAL_STORAGE_NR:-$LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR}
386+
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EPHEMERAL_STORAGE=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EPHEMERAL_STORAGE:-$LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE}
387387
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_PREPROCESS=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_PREPROCESS:-true}
388388
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND:-vllmServe}
389389
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS:-"[--disable-log-requests____--max-model-len____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN____--tensor-parallel-size____REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM]"}

0 commit comments

Comments
 (0)