ci/L0_backend_vllm/vllm_backend (1 file changed: +4 -5 lines)

@@ -114,23 +114,22 @@ if [[ "$COUNT" -ne 2 ]]; then
     echo "Cmdline parameters verification Failed"
 fi
 
-# Test loading multiple vllm models at the same time
+# Test loading multiple vllm models
 SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_one"
 SERVER_LOG="./vllm_test_multi_model.log"
 
 # Create two models, one is just a copy of the other, and make sure gpu
 # utilization is low enough for multiple models to avoid OOM.
 # vLLM changed behavior of their GPU profiler from total to free memory,
-# so to load two small models at the same time, we need to start
-# triton server in explicit mode, load first model with
-# `gpu_memory_utilization` 0.4 and second should be 0.9.
+# so to load two small models, we need to start
+# triton server in explicit mode.
 MODEL1="vllm_one"
 MODEL2="vllm_two"
 mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/${MODEL1}/
 cp -r models/${MODEL1} models/${MODEL2}
 sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL1}/1/model.json
-sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.9/' models/${MODEL2}/1/model.json
+sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL2}/1/model.json
 
 run_server
 if [ "$SERVER_PID" == "0" ]; then
You can’t perform that action at this time.
0 commit comments
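Because the server starts with --model-control-mode=explicit and only --load-model=vllm_one, the second copy must be loaded at runtime. The hunk ends before that step, so the exact mechanism the test uses is not visible here, but with Triton's standard model-control HTTP API it could look like this sketch (assumes the default HTTP port 8000 on localhost):

    # Load the second model via Triton's repository load endpoint (sketch only)
    curl -X POST localhost:8000/v2/repository/models/vllm_two/load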