Skip to content

Commit 9b7c801

Browse files
[Run] Capture the logs for all pods (from llm-d stack) at the end (#638)
Whenever run is invoked, try to capture the logs from the gaie, decode, prefill and inference-gateway pods at the end of a run. The llm-d-benchmark executable now has independent retry loops for the harness and the analyzer. Finally, a few improvements to `preprocess/set_llmdbench_environment.py`. Signed-off-by: maugustosilva <maugusto.silva@gmail.com>
1 parent 0d181c8 commit 9b7c801

File tree

6 files changed

+204
-56
lines changed

6 files changed

+204
-56
lines changed

build/llm-d-benchmark.sh

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
#!/usr/bin/env bash
2-
export LLMDBENCH_RUN_EXPERIMENT_HARNESS_EC=1
2+
export LLMDBENCH_RUN_EXPERIMENT_HARNESS_LOADGEN_EC=1
3+
export LLMDBENCH_RUN_EXPERIMENT_HARNESS_REPORT_EC=1
4+
35
export LLMDBENCH_RUN_EXPERIMENT_HARNESS_NAME_AUTO=1
46
export LLMDBENCH_RUN_EXPERIMENT_HARNESS_WORKLOAD_AUTO=1
7+
export LLMDBENCH_RUN_EXPERIMENT_HARNESS_MAX_TRIES=${LLMDBENCH_RUN_EXPERIMENT_HARNESS_MAX_TRIES:-3}
58

69
function show_usage {
710
echo -e "Usage: $0 -l/--harness [harness used to generate load (default=$LLMDBENCH_HARNESS_NAME, possible values $(ls $LLMDBENCH_RUN_WORKSPACE_DIR/profiles/ | sed -n ':a;N;$!ba;s/\n/,/g;p')] \n \
@@ -93,10 +96,10 @@ fi
9396

9497
env | grep ^LLMDBENCH | grep -v BASE64 | sort
9598

96-
# Repeat run until success
99+
97100
echo "Running harness: /usr/local/bin/${LLMDBENCH_RUN_EXPERIMENT_HARNESS}"
98101
counter=1
99-
while [[ $LLMDBENCH_RUN_EXPERIMENT_HARNESS_EC -ne 0 && "${counter}" -le 3 ]]; do
102+
while [[ $LLMDBENCH_RUN_EXPERIMENT_HARNESS_LOADGEN_EC -ne 0 && "${counter}" -le $LLMDBENCH_RUN_EXPERIMENT_HARNESS_MAX_TRIES ]]; do
100103
/usr/local/bin/${LLMDBENCH_RUN_EXPERIMENT_HARNESS}
101104
ec=$?
102105
if [[ $ec -ne 0 ]]; then
@@ -105,7 +108,7 @@ while [[ $LLMDBENCH_RUN_EXPERIMENT_HARNESS_EC -ne 0 && "${counter}" -le 3 ]]; do
105108
counter="$(( ${counter} + 1 ))"
106109
set -x
107110
else
108-
export LLMDBENCH_RUN_EXPERIMENT_HARNESS_EC=0
111+
export LLMDBENCH_RUN_EXPERIMENT_HARNESS_LOADGEN_EC=0
109112
fi
110113
done
111114
echo "Harness completed: /usr/local/bin/${LLMDBENCH_RUN_EXPERIMENT_HARNESS}"
@@ -115,18 +118,23 @@ if [[ -f ~/fixbashrc ]]; then
115118
fi
116119

117120
echo "Running analysis: /usr/local/bin/${LLMDBENCH_RUN_EXPERIMENT_ANALYZER}"
118-
# Try to run analysis twice then give up
121+
counter=1
122+
while [[ $LLMDBENCH_RUN_EXPERIMENT_HARNESS_REPORT_EC -ne 0 && "${counter}" -le $LLMDBENCH_RUN_EXPERIMENT_HARNESS_MAX_TRIES ]]; do
119123
/usr/local/bin/${LLMDBENCH_RUN_EXPERIMENT_ANALYZER}
120124
ec=$?
121125
if [[ $ec -ne 0 ]]; then
122-
echo "execution of /usr/local/bin/${LLMDBENCH_RUN_EXPERIMENT_ANALYZER} failed, wating 120 seconds and trying again"
123-
sleep 120
124-
set -x
125-
/usr/local/bin/${LLMDBENCH_RUN_EXPERIMENT_ANALYZER}
126-
fi
126+
echo "execution of /usr/local/bin/${LLMDBENCH_RUN_EXPERIMENT_ANALYZER} failed, wating 30 seconds and trying again"
127+
sleep 30
128+
counter="$(( ${counter} + 1 ))"
129+
set -x
130+
else
131+
export LLMDBENCH_RUN_EXPERIMENT_HARNESS_REPORT_EC=0
132+
fi
133+
done
134+
127135

128136
if [[ $LLMDBENCH_RUN_EXPERIMENT_HARNESS_NAME_AUTO -eq 0 ]]; then
129137
echo "Done. Data is available at \"$LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR\""
130138
fi
131139
# Exit status is the number of stages (load generation, analysis) that never succeeded: 0, 1 or 2
132-
exit $ec
140+
exit $((LLMDBENCH_RUN_EXPERIMENT_HARNESS_LOADGEN_EC + LLMDBENCH_RUN_EXPERIMENT_HARNESS_REPORT_EC))

setup/env.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ export LLMDBENCH_VLLM_COMMON_ACCELERATOR_NR=${LLMDBENCH_VLLM_COMMON_ACCELERATOR_
119119
export LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM=${LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM:-1}
120120
export LLMDBENCH_VLLM_COMMON_DATA_PARALLELISM=${LLMDBENCH_VLLM_COMMON_DATA_PARALLELISM:-1}
121121
export LLMDBENCH_VLLM_COMMON_DATA_LOCAL_PARALLELISM=${LLMDBENCH_VLLM_COMMON_DATA_LOCAL_PARALLELISM:-1}
122-
# export LLMDBENCH_VLLM_COMMON_NUM_WORKERS_PARALLELISM=${LLMDBENCH_VLLM_COMMON_NUM_WORKERS_PARALLELISM:-1}
122+
export LLMDBENCH_VLLM_COMMON_NUM_WORKERS_PARALLELISM=${LLMDBENCH_VLLM_COMMON_NUM_WORKERS_PARALLELISM:-1}
123123
export LLMDBENCH_VLLM_COMMON_ACCELERATOR_MEM_UTIL=${LLMDBENCH_VLLM_COMMON_ACCELERATOR_MEM_UTIL:-0.95}
124124
export LLMDBENCH_VLLM_COMMON_CPU_NR=${LLMDBENCH_VLLM_COMMON_CPU_NR:-4}
125125
export LLMDBENCH_VLLM_COMMON_CPU_MEM=${LLMDBENCH_VLLM_COMMON_CPU_MEM:-40Gi}

setup/functions.sh

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -509,20 +509,11 @@ function deploy_harness_config {
509509
announce "✅ Collected analysis for pods with label \"app=${LLMDBENCH_HARNESS_POD_LABEL}\" at: \"${LLMDBENCH_CONTROL_WORK_DIR}/analysis/\""
510510
511511
announce "🗑️ Deleting pods with label \"app=${LLMDBENCH_HARNESS_POD_LABEL}\" for model \"$model\" ..."
512-
llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} --namespace ${LLMDBENCH_HARNESS_NAMESPACE} delete pod -l app=${LLMDBENCH_HARNESS_POD_LABEL}" \
513-
${LLMDBENCH_CONTROL_DRY_RUN} \
514-
${LLMDBENCH_CONTROL_VERBOSE}
515-
llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} --namespace ${LLMDBENCH_HARNESS_NAMESPACE} delete pod -l app=llm-d-benchmark-harness" \
512+
llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} --namespace ${LLMDBENCH_HARNESS_NAMESPACE} delete pod -l function=load_generator" \
516513
${LLMDBENCH_CONTROL_DRY_RUN} \
517514
${LLMDBENCH_CONTROL_VERBOSE}
518515
announce "✅ Pods with label \"app=${LLMDBENCH_HARNESS_POD_LABEL}\" for model \"$model\" deleted"
519516
520-
announce "ℹ️ Capturing the current status of all pods in namespace \"$LLMDBENCH_VLLM_COMMON_NAMESPACE\" to ${pod_results_dir}/pod_status.txt..."
521-
llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} --namespace $LLMDBENCH_VLLM_COMMON_NAMESPACE get pods -o wide > ${pod_results_dir}/pod_status.txt" \
522-
${LLMDBENCH_CONTROL_DRY_RUN} \
523-
${LLMDBENCH_CONTROL_VERBOSE}
524-
announce "✅ Pod status captured."
525-
526517
elif [[ $LLMDBENCH_HARNESS_WAIT_TIMEOUT -eq 0 ]]; then
527518
announce "ℹ️ Harness was started with LLMDBENCH_HARNESS_WAIT_TIMEOUT=0. Will NOT wait for pod \"${LLMDBENCH_HARNESS_POD_LABEL}\" for model \"$model\" to be in \"Completed\" state. The pod can be accessed through \"${LLMDBENCH_CONTROL_KCMD} --namespace ${LLMDBENCH_HARNESS_NAMESPACE} exec -it pod/<POD_NAME> -- bash\""
528519
announce "ℹ️ To list pod names \"${LLMDBENCH_CONTROL_KCMD} --namespace ${LLMDBENCH_HARNESS_NAMESPACE} get pods -l app=${LLMDBENCH_HARNESS_POD_LABEL}\""
@@ -536,6 +527,39 @@ function deploy_harness_config {
536527
}
537528
export -f deploy_harness_config
538529
530+
#######################################
# Capture the pod status and the pod logs of the model-serving stack into the
# per-harness-pod results directories.
#
# For every index i in 1..LLMDBENCH_HARNESS_LOAD_PARALLELISM the directory
# "<local_results_dir>_<i>" receives:
#   pod_status.txt               output of "get pods -o wide"
#   logs/modelserving_pods.log   pods labelled llm-d.ai/model=<modelid_label>
#   logs/epp_pods.log            pods labelled inferencepool=<modelid_label>-gaie-epp
#   logs/igw_pods.log            pods labelled app.kubernetes.io/component=inference-gateway
#
# Globals (read):
#   LLMDBENCH_HARNESS_LOAD_PARALLELISM, LLMDBENCH_CONTROL_KCMD,
#   LLMDBENCH_VLLM_COMMON_NAMESPACE, LLMDBENCH_CONTROL_DRY_RUN,
#   LLMDBENCH_CONTROL_VERBOSE
# Arguments:
#   $1 - model name, resolved to its label through model_attribute
#   $2 - results directory prefix (the "_<i>" suffix is appended here)
# Notes:
#   - Every command is handed to llmdbench_execute_cmd as a single string with
#     the output redirection embedded in it; presumably that helper evaluates
#     the string (TODO confirm against its definition).
#   - The logs/ directory is created directly with mkdir, independently of
#     LLMDBENCH_CONTROL_DRY_RUN.
#######################################
function capture_pod_logs {
    local model=$1
    local local_results_dir=$2

    # Declared separately from the assignment so that "local" does not mask
    # the exit status of model_attribute.
    local modelid_label
    modelid_label=$(model_attribute "$model" modelid_label)

    # Label selectors and the log file each one is written to (parallel arrays).
    local -a log_selectors=(
        "llm-d.ai/model=\"$modelid_label\""
        "inferencepool=\"${modelid_label}-gaie-epp\""
        "\"app.kubernetes.io/component=inference-gateway\""
    )
    local -a log_files=(
        "modelserving_pods.log"
        "epp_pods.log"
        "igw_pods.log"
    )

    # Loop variables are local so they cannot clobber same-named variables in
    # the caller (bash uses dynamic scoping).
    local i idx pod_results_dir
    for (( i = 1; i <= LLMDBENCH_HARNESS_LOAD_PARALLELISM; i++ )); do
        pod_results_dir="${local_results_dir}_${i}"

        announce "ℹ️ Capturing the current status of all pods in namespace \"$LLMDBENCH_VLLM_COMMON_NAMESPACE\" to ${pod_results_dir}/pod_status.txt ..."
        llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} --namespace $LLMDBENCH_VLLM_COMMON_NAMESPACE get pods -o wide > ${pod_results_dir}/pod_status.txt" \
            ${LLMDBENCH_CONTROL_DRY_RUN} \
            ${LLMDBENCH_CONTROL_VERBOSE}
        announce "✅ Pod status captured."

        announce "ℹ️ Capturing logs for all pods in namespace \"$LLMDBENCH_VLLM_COMMON_NAMESPACE\" to ${pod_results_dir}/logs/ ..."
        mkdir -p -- "${pod_results_dir}/logs/"
        for idx in "${!log_selectors[@]}"; do
            llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} --namespace $LLMDBENCH_VLLM_COMMON_NAMESPACE logs --tail=-1 --prefix=true -l ${log_selectors[$idx]} > ${pod_results_dir}/logs/${log_files[$idx]}" \
                ${LLMDBENCH_CONTROL_DRY_RUN} \
                ${LLMDBENCH_CONTROL_VERBOSE}
        done
    done
}
561+
export -f capture_pod_logs
562+
539563
function create_harness_pod {
540564
541565
local _podname=$1
@@ -560,6 +584,7 @@ metadata:
560584
namespace: ${LLMDBENCH_HARNESS_NAMESPACE}
561585
labels:
562586
app: ${LLMDBENCH_HARNESS_POD_LABEL}
587+
function: load_generator
563588
spec:
564589
containers:
565590
- name: harness
@@ -825,14 +850,13 @@ export -f generate_profile_parameter_treatments
825850
826851
function cleanup_pre_execution {
827852
announce "🗑️ Deleting pods with label \"${LLMDBENCH_HARNESS_POD_LABEL}\"..."
828-
llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} --namespace ${LLMDBENCH_HARNESS_NAMESPACE} delete pod -l app=${LLMDBENCH_HARNESS_POD_LABEL} --ignore-not-found" ${LLMDBENCH_CONTROL_DRY_RUN} ${LLMDBENCH_CONTROL_VERBOSE}
853+
llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} --namespace ${LLMDBENCH_HARNESS_NAMESPACE} delete pod -l app=${LLMDBENCH_HARNESS_POD_LABEL},function=load_generator --ignore-not-found" ${LLMDBENCH_CONTROL_DRY_RUN} ${LLMDBENCH_CONTROL_VERBOSE}
829854
# Sanitize the stack name to make it a valid K8s/OpenShift resource name
830855
local LLMDBENCH_HARNESS_SANITIZED_STACK_NAME=$(echo "${LLMDBENCH_HARNESS_STACK_NAME}" | $LLMDBENCH_CONTROL_SCMD 's|[/:]|-|g')
831856
llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} --namespace ${LLMDBENCH_HARNESS_NAMESPACE} delete job lmbenchmark-evaluate-${LLMDBENCH_HARNESS_SANITIZED_STACK_NAME} --ignore-not-found" ${LLMDBENCH_CONTROL_DRY_RUN} ${LLMDBENCH_CONTROL_VERBOSE}
832857
announce "ℹ️ Done deleting pods with label \"${LLMDBENCH_HARNESS_POD_LABEL}\" (it will be now recreated)"
833858
834859
}
835-
836860
export -f cleanup_pre_execution
837861
838862
function validate_model_name {

0 commit comments

Comments
 (0)