 log_sum="log/service_model_device.txt"
 
 model_ids=("TinyLlama/TinyLlama-1.1B-Chat-v1.0") # "facebook/opt-1.3b" "huggyllama/llama-7b")
-num_devices=(2)
+num_lpu_devices=(2) # 4
+num_gpu_devices=(0)
 
 current_datetime=$(date "+%Y-%m-%d %H:%M:%S")
 echo "$current_datetime"
 echo "$current_datetime" >> ${log_sum}
 
 # LLMEngine Test
 for model_id in "${model_ids[@]}"; do
-    for num_device in "${num_devices[@]}"; do
+    for num_lpu_device in "${num_lpu_devices[@]}"; do
+    for num_gpu_device in "${num_gpu_devices[@]}"; do
         # IFS='/' read -ra parts <<< "$model_id"
         # model_name="${parts[-1]}"
         model_name=$(echo "$model_id" | awk -F'/' '{print $NF}')
         echo "*********************************"
-        echo "**** Start inference_${model_name}_${num_device}"
+        echo "**** Start inference_${model_name}_${num_lpu_device}_${num_gpu_device}"
         echo "*********************************"
-        python lpu_inference_arg.py -m ${model_id} -n ${num_device} > log/inference_${model_name}_${num_device}.txt
+        python lpu_inference_arg.py -m ${model_id} -l ${num_lpu_device} -g ${num_gpu_device} > log/inference_${model_name}_${num_lpu_device}_${num_gpu_device}.txt
         echo "*********************************" >> ${log_sum}
-        echo "[Testbench] The Result of log/inference_${model_name}_${num_device}.txt" >> ${log_sum}
-        tail -n 1 "log/inference_${model_name}_${num_device}.txt" >> ${log_sum}
+        echo "[Testbench] The Result of log/inference_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
+        tail -n 1 "log/inference_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
         echo " " >> ${log_sum}
+    done
     done
 done
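Expanded, the nested sweep above runs lpu_inference_arg.py once per (model, LPU count, GPU count) combination. With the default arrays it reduces to a single invocation along these lines (a sketch of the generated command, using the -m/-l/-g flags introduced in this diff):

    python lpu_inference_arg.py -m TinyLlama/TinyLlama-1.1B-Chat-v1.0 -l 2 -g 0 \
        > log/inference_TinyLlama-1.1B-Chat-v1.0_2_0.txt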
 
 # LLMEngineAsync Test with vLLM serve
 for model_id in "${model_ids[@]}"; do
-    for num_device in "${num_devices[@]}"; do
+    for num_lpu_device in "${num_lpu_devices[@]}"; do
+    for num_gpu_device in "${num_gpu_devices[@]}"; do
         model_name=$(echo "$model_id" | awk -F'/' '{print $NF}')
         echo "*********************************"
-        echo "**** Start serving_${model_name}_${num_device}"
+        echo "**** Start serving_${model_name}_${num_lpu_device}_${num_gpu_device}"
         echo "*********************************"
-        python -m vllm.entrypoints.api_server --model ${model_id} --device fpga --tensor-parallel-size ${num_device} &
+        python -m vllm.entrypoints.api_server --model ${model_id} --device fpga --num-lpu-devices ${num_lpu_device} --num-gpu-devices ${num_gpu_device} &
 
         # Waiting for server
         while ! nc -z localhost "8000"; do
@@ -41,7 +45,7 @@ for model_id in "${model_ids[@]}"; do
         done
         echo "[Testbench] The server is ready!"
 
-        python lpu_client.py > log/vllm_serve_${model_name}_${num_device}.txt
+        python lpu_client.py > log/vllm_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt
 
         # Waiting for process kill
         PID=$(jobs -p | tail -n 1)
@@ -60,22 +64,24 @@ for model_id in "${model_ids[@]}"; do
 
         # Write log in text file
         echo "*********************************" >> ${log_sum}
-        echo "The Result of log/vllm_serve_${model_name}_${num_device}.txt" >> ${log_sum}
-        tail -n 1 "log/vllm_serve_${model_name}_${num_device}.txt" >> ${log_sum}
+        echo "The Result of log/vllm_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
+        tail -n 1 "log/vllm_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
         echo " " >> ${log_sum}
+    done
     done
 done
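The process-kill block is elided between the hunks above, but the lifecycle each iteration follows is visible: start the API server in the background, poll the port with nc until it accepts connections, run the client, then kill the background job. A minimal sketch of that pattern (the sleep interval and the SERVER_PID helper are assumptions for illustration, not the commit's exact code):

    python -m vllm.entrypoints.api_server --model "$model_id" --device fpga \
        --num-lpu-devices "$num_lpu_device" --num-gpu-devices "$num_gpu_device" &
    SERVER_PID=$!                                   # the script itself grabs the PID via: jobs -p | tail -n 1
    while ! nc -z localhost 8000; do sleep 1; done  # block until the server accepts connections
    python lpu_client.py > "log/vllm_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt"
    kill "$SERVER_PID"; wait "$SERVER_PID" 2>/dev/null  # stop the server and reap the job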
 
 
 
 # OpenAI API Test
 model_id=${model_ids[0]}
-num_device=${num_devices[0]}
+num_lpu_device=${num_lpu_devices[0]}
+num_gpu_device=${num_gpu_devices[0]}
 model_name=$(echo "$model_id" | awk -F'/' '{print $NF}')
 echo "*********************************"
-echo "**** Start serving_${model_name}_${num_device}"
+echo "**** Start serving_${model_name}_${num_lpu_device}_${num_gpu_device}"
 echo "*********************************"
-python -m vllm.entrypoints.api_server --model ${model_id} --device fpga --tensor-parallel-size ${num_device} &
+python -m vllm.entrypoints.openai.api_server --model ${model_id} --device fpga --num-lpu-devices ${num_lpu_device} --num-gpu-devices ${num_gpu_device} &
 
 # Waiting for server
 while ! nc -z localhost "8000"; do
@@ -84,7 +90,7 @@ while ! nc -z localhost "8000"; do
 done
 echo "[Testbench] The server is ready!"
 
-python lpu_openai_completion_client.py > log/openai_serve_${model_name}_${num_device}.txt
+python lpu_openai_completion_client.py > log/openai_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt
 
 # Waiting for process kill
 PID=$(jobs -p | tail -n 1)
@@ -103,8 +109,6 @@
 
 # Write log in text file
 echo "*********************************" >> ${log_sum}
-echo "The Result of log/openai_serve_${model_name}_${num_device}.txt" >> ${log_sum}
-tail -n 1 "log/openai_serve_${model_name}_${num_device}.txt" >> ${log_sum}
+echo "The Result of log/openai_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
+tail -n 1 "log/openai_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
 echo " " >> ${log_sum}
-
-
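lpu_openai_completion_client.py is not shown in this diff; the OpenAI-compatible server it exercises can also be checked by hand with vLLM's standard completions route (assuming the default port 8000 used above):

    curl http://localhost:8000/v1/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "prompt": "Hello, my name is", "max_tokens": 16}'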