Skip to content

Commit 5ee67a3

Browse files
Ci: Add retry mechanism for flaky gtest execution (#1312)
Create wrapper script to retry failed tests and fix gtest return codes for proper error handling.
1 parent 7366335 commit 5ee67a3

File tree

15 files changed

+439
-148
lines changed

15 files changed

+439
-148
lines changed

.github/scripts/gtest.sh

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
#!/bin/bash
# shellcheck disable=SC2317

# Wrapper around the MTL gtest binaries that retries failed test groups.
# Every setting assigned with ": ${VAR:=default}" below may be overridden
# from the environment before the script is started.

# shellcheck disable=SC2034 # kept for compatibility; no longer used to derive the folder
script_name=$(basename "${BASH_SOURCE[0]}")
script_path=$(readlink -qe "${BASH_SOURCE[0]}")
# Take the directory of the *resolved* path. Stripping $script_name out of
# $script_path fails when the script is started through a symlink with a
# different name, and treats the name as a glob pattern.
script_folder=$(dirname "${script_path}")
mtl_folder="${script_folder}/../../"
# Test name -> command line; filled in by generate_test_cases.
declare -A test_cases

: "${KAHAWAI_TEST_BINARY:="${mtl_folder}/build/tests/KahawaiTest"}"
: "${KAHAWAI_UFD_TEST_BINARY:="${mtl_folder}/build/tests/KahawaiUfdTest"}"
: "${KAHAWAI_UPL_TEST_BINARY:="${mtl_folder}/build/tests/KahawaiUplTest"}"
: "${MAX_RETRIES:=3}"   # attempts per test
: "${RETRY_DELAY:=10}"  # seconds slept before a retry
: "${LOG_FILE:=$(mktemp /tmp/gtest_log.XXXXXX)}"
: "${EXIT_ON_FAILURE:=1}"
: "${MTL_LD_PRELOAD:=/usr/local/lib/x86_64-linux-gnu/libmtl_udp_preload.so}"
: "${MUFD_CFG:="${mtl_folder}/.github/workflows/upl_gtest.json"}"
: "${NIGHTLY:=1}" # Set to 1 to run full test suite, 0 for quick tests
echo "Log file: $LOG_FILE"

# Reference point (epoch seconds) for time_taken_by_script.
start_time=$(date +%s)
23+
24+
# Print the wall-clock time elapsed since the global start_time (epoch
# seconds) as "<h>h <m>m <s>s", framed by two separator lines.
time_taken_by_script() {
  local now total h m s
  local rule="=========================================="

  now=$(date +%s)
  total=$((now - start_time))
  h=$((total / 3600))
  m=$((total % 3600 / 60))
  s=$((total % 60))

  printf '%s\n' "$rule" "Time elapsed: ${h}h ${m}m ${s}s" "$rule"
}
36+
37+
# Handler for SIGINT/SIGTERM: stop the test binaries, report the elapsed
# time, then leave with status 130.
cleanup_on_signal() {
  printf '%s\n' "Received termination signal. Cleaning up..."
  kill_test_processes
  time_taken_by_script
  exit 130
}
trap cleanup_on_signal INT TERM
44+
45+
# Number of ICE driver resets performed so far; incremented by
# reset_ice_driver each time it runs.
retry_counter=0
46+
47+
# Fill the global associative array test_cases (test name -> command line).
#
# Globals read: NIGHTLY, KAHAWAI_TEST_BINARY, KAHAWAI_UFD_TEST_BINARY,
#               KAHAWAI_UPL_TEST_BINARY, MTL_LD_PRELOAD, mtl_folder,
#               TEST_PORT_1, TEST_PORT_2, TEST_DMA_PORT_P, TEST_DMA_PORT_R
# Globals written: test_cases
#
# NIGHTLY=1 registers the full suite; any other value registers only the
# four st2110_* groups. The command strings are later executed with eval,
# which is why paths are wrapped in escaped double quotes.
generate_test_cases() {
  # Prefixes shared by most entries, built once so that every test receives
  # exactly the same binary, port and DMA arguments.
  local kahawai_cmd="sudo -E \"${KAHAWAI_TEST_BINARY}\" --auto_start_stop --p_port \"${TEST_PORT_1}\" --r_port \"${TEST_PORT_2}\" --dma_dev \"${TEST_DMA_PORT_P},${TEST_DMA_PORT_R}\""
  local ufd_cmd="\"${KAHAWAI_UFD_TEST_BINARY}\" --p_port \"${TEST_PORT_1}\" --r_port \"${TEST_PORT_2}\""
  local st20p_filter="--gtest_filter=Main*:St20p*:-*ext*"

  if [ "${NIGHTLY}" -eq 1 ]; then
    test_cases["digest_1080p_timeout_interval"]="${kahawai_cmd} --rss_mode l3_l4 --pacing_way tsc --iova_mode pa --multi_src_port --gtest_filter=*digest_1080p_timeout_interval*"
    test_cases["ufd_basic"]="${ufd_cmd}"
    test_cases["ufd_shared"]="${ufd_cmd} --queue_mode shared"
    test_cases["ufd_shared_lcore"]="${ufd_cmd} --queue_mode shared --udp_lcore"
    test_cases["ufd_rss"]="${ufd_cmd} --rss_mode l3_l4"
    # The binary path is quoted like every other one so that a path with
    # spaces survives the eval in run_test_with_retry.
    test_cases["udp_ld_preload"]="LD_PRELOAD=\"${MTL_LD_PRELOAD}\" \"${KAHAWAI_UPL_TEST_BINARY}\" --p_sip 192.168.2.80 --r_sip 192.168.2.81"
    test_cases["Misc"]="${kahawai_cmd} --gtest_filter=Misc*"
    test_cases["Main"]="${kahawai_cmd} --gtest_filter=Main*"
    test_cases["Sch"]="${kahawai_cmd} --gtest_filter=Sch*"
    test_cases["Dma_va"]="${kahawai_cmd} --iova_mode va --gtest_filter=Dma*"
    test_cases["Dma_pa"]="${kahawai_cmd} --iova_mode pa --gtest_filter=Dma*"
    test_cases["Cvt"]="${kahawai_cmd} --gtest_filter=Cvt*"
    test_cases["st2110_20"]="${kahawai_cmd} --gtest_filter=St20*:St21*"
    test_cases["st2110_22"]="${kahawai_cmd} --gtest_filter=St22*"
    test_cases["st2110_3x"]="${kahawai_cmd} --gtest_filter=St3*"
    test_cases["st2110_4x"]="${kahawai_cmd} --gtest_filter=St4*"
    test_cases["st20p_auto_pacing_pa"]="${kahawai_cmd} --rss_mode l3_l4 --pacing_way auto --iova_mode pa --multi_src_port ${st20p_filter}"
    test_cases["st20p_auto_pacing_va"]="${kahawai_cmd} --rss_mode l3_l4 --pacing_way auto --iova_mode va --multi_src_port ${st20p_filter}"
    test_cases["st20p_tsc_pacing"]="${kahawai_cmd} --rss_mode l3_l4 --pacing_way tsc --iova_mode va --multi_src_port ${st20p_filter}"
    test_cases["st20p_kernel_loopback"]="\"${KAHAWAI_TEST_BINARY}\" --auto_start_stop --p_port kernel:lo --r_port kernel:lo --gtest_filter=St20p*"
    test_cases["noctx"]="\"${mtl_folder}/tests/integration_tests/noctx/run.sh\"" # noctx uses script to run as it needs more setup
  else
    test_cases["st2110_20"]="${kahawai_cmd} --gtest_filter=St20*:St21*"
    test_cases["st2110_22"]="${kahawai_cmd} --gtest_filter=St22*"
    test_cases["st2110_3x"]="${kahawai_cmd} --gtest_filter=St3*"
    test_cases["st2110_4x"]="${kahawai_cmd} --gtest_filter=St4*"
  fi
}
77+
78+
# Print up to "$1" distinct ports currently bound to vfio-pci, one per
# line, in random order (second column of "nicctl.sh list all").
_list_vfio_ports() {
  "${mtl_folder}/script/nicctl.sh" list all |
    awk '$3 == "vfio-pci" {print $2}' |
    shuf -n "$1"
}

# Set TEST_PORT_1..TEST_PORT_4 to distinct random vfio-pci ports. Variables
# for which no port is available are set to the empty string.
_pick_test_ports() {
  local -a picked=()
  mapfile -t picked < <(_list_vfio_ports 4)
  TEST_PORT_1=${picked[0]:-}
  TEST_PORT_2=${picked[1]:-}
  TEST_PORT_3=${picked[2]:-}
  TEST_PORT_4=${picked[3]:-}
}

# Set TEST_DMA_PORT_P / TEST_DMA_PORT_R to two distinct random devices whose
# "dpdk-devbind.py -s" line mentions "$1"; empty when not available.
_pick_dma_ports() {
  local -a picked=()
  mapfile -t picked < <(dpdk-devbind.py -s | grep -- "$1" | awk '{print $1}' | shuf -n 2)
  TEST_DMA_PORT_P=${picked[0]:-}
  TEST_DMA_PORT_R=${picked[1]:-}
}

# Select the NIC and DMA devices used by the tests and regenerate the test
# command lines.
#
# Globals read: mtl_folder, pf (optional PF to create VFs on), MUFD_CFG
# Globals written/exported: TEST_PORT_1..TEST_PORT_4, MUFD_CFG
# Globals written: pf, TEST_DMA_PORT_P, TEST_DMA_PORT_R, test_cases
# Exits with status 1 when the UPL config template is missing or when no
# pair of DMA devices can be found.
bind_driver_to_dpdk() {
  local template="${mtl_folder}/.github/workflows/upl_gtest_template.json"
  local dma_mechanism

  sudo modprobe ice || true

  # One random sample of four lines yields four distinct ports without
  # having to filter earlier picks back out with grep -v, which matched the
  # port as a regular expression against whole lines.
  _pick_test_ports

  if [ -z "$TEST_PORT_1" ] || [ -z "$TEST_PORT_2" ] || [ -z "$TEST_PORT_3" ] || [ -z "$TEST_PORT_4" ]; then
    if [ -z "${pf:-}" ]; then
      pf=$("${mtl_folder}/script/nicctl.sh" list up | awk '$3 == "ice" {print $2}' | shuf -n 1)
    fi

    if [ -n "${pf}" ]; then
      echo "Binding PF $pf to DPDK driver"
      sudo -E "${mtl_folder}/script/nicctl.sh" create_tvf "$pf"
    else
      # Do not call create_tvf with an empty argument; carry on with
      # whatever ports are already bound.
      echo "Warning: no ice PF found, skipping VF creation"
    fi

    _pick_test_ports
  fi

  # for the noctx tests
  export TEST_PORT_1
  export TEST_PORT_2
  export TEST_PORT_3
  export TEST_PORT_4

  if [ ! -f "${template}" ]; then
    echo "Error: Template file not found: ${template}"
    time_taken_by_script
    exit 1
  fi

  mkdir -p "$(dirname "$MUFD_CFG")" || true
  cp -f "${template}" "$MUFD_CFG"
  export MUFD_CFG

  sed -i "s+REPLACE_BY_CICD_TEST_PORT_1+${TEST_PORT_1}+" "$MUFD_CFG"
  sed -i "s+REPLACE_BY_CICD_TEST_PORT_2+${TEST_PORT_2}+" "$MUFD_CFG"
  echo "Selected ports: P=$TEST_PORT_1, R=$TEST_PORT_2"

  # Use the first mechanism that offers two devices.
  for dma_mechanism in "CBDMA" "idxd"; do
    _pick_dma_ports "$dma_mechanism"
    if [ -n "$TEST_DMA_PORT_P" ] && [ -n "$TEST_DMA_PORT_R" ]; then
      break
    fi
  done

  if [ -z "$TEST_DMA_PORT_P" ] || [ -z "$TEST_DMA_PORT_R" ]; then
    echo "Error: Could not find suitable DPDK DMA devices"
    time_taken_by_script
    exit 1
  fi

  generate_test_cases
}
135+
136+
# Unload and reload the ice kernel module, pausing after each step, and
# count the reset in the global retry_counter. modprobe failures are
# ignored.
reset_ice_driver() {
  local unload_pause=5
  local reload_pause=10

  printf '%s\n' "Resetting ICE driver..."

  sudo modprobe -r ice || true
  sleep "$unload_pause"

  sudo modprobe ice || true
  sleep "$reload_pause"

  printf '%s\n' "ICE driver reset completed"
  retry_counter=$((retry_counter + 1))
}
145+
146+
# Send SIGINT to every test binary and to MtlManager, then wait two seconds.
# Output and failures of killall (e.g. nothing to kill) are discarded.
kill_test_processes() {
  local target
  for target in KahawaiTest KahawaiUfdTest KahawaiUplTest MtlManager; do
    sudo killall -SIGINT "$target" >/dev/null 2>&1 || true
  done
  sleep 2
}
153+
154+
# Launch MtlManager in the background via sudo unless pgrep already finds a
# process whose command line mentions it; wait three seconds after launch.
start_mtl_manager() {
  if pgrep -f MtlManager >/dev/null; then
    return 0
  fi

  printf '%s\n' "Starting MtlManager..."
  sudo MtlManager &
  sleep 3
}
161+
162+
# Log fragments that point at a configuration problem needing manual
# intervention. Retrying cannot help once one of them shows up in the log,
# so the run gives up immediately.
declare -a error_messages=()
error_messages+=("Not a directory")
error_messages+=("mt_user_params_check, same name for port 1 and 0")
error_messages+=("get socket fail -19 for pmd 0")
error_messages+=("EAL: Cannot use IOVA as")
error_messages+=("from LD_PRELOAD cannot be preloaded")
error_messages+=("Error: ufd_parse_json, open json file ufd.json fail")
error_messages+=("libmtl.so: cannot open shared object file:")
error_messages+=("EAL: Cannot set up DMA remapping, error 12 (Cannot allocate memory)")
174+
175+
# Scan $LOG_FILE for any entry of the global error_messages array.
#
# Returns 1 (after printing the offending message) when one is found,
# 0 otherwise. An unreadable or missing log counts as "nothing found".
check_configuration_errors() {
  local message
  for message in "${error_messages[@]}"; do
    # -F: the entries are literal text, not regular expressions (several
    # contain '.'); "--" guards against an entry starting with '-'.
    if grep -qF -- "$message" "$LOG_FILE"; then
      echo "✗ Configuration error detected: $message"
      return 1
    fi
  done
  return 0
}
184+
185+
# Background watchdog: every 15 seconds look for configuration errors in the
# log and, when one is found, stop the running test binaries.
# It is started with '&', so its "exit 1" only ends the watchdog itself; the
# main script notices the problem through the killed test failing.
watchdog_for_configuration_errors() {
  while true; do
    sleep 15
    if ! check_configuration_errors; then
      echo "✗ Configuration error detected by watchdog. Exiting..."
      kill_test_processes
      time_taken_by_script
      exit 1
    fi
  done
}

# Stop the watchdog and the sleep it is waiting on. Installed as the EXIT
# handler so no orphaned process keeps the script's stdout/stderr open.
stop_watchdog() {
  local children
  [ -n "${watchdog_pid:-}" ] || return 0
  # Collect the children first: once the watchdog is gone they are
  # re-parented and can no longer be found through its PID.
  children=$(pgrep -P "$watchdog_pid" 2>/dev/null) || true
  kill "$watchdog_pid" >/dev/null 2>&1 || true
  if [ -n "$children" ]; then
    # shellcheck disable=SC2086 # word splitting of the PID list is intended
    kill $children >/dev/null 2>&1 || true
  fi
}

watchdog_for_configuration_errors &
watchdog_pid=$!
trap stop_watchdog EXIT
198+
199+
# Run one entry of test_cases, retrying up to MAX_RETRIES times.
#
# Arguments: $1 - key into the global test_cases array
# Globals read: test_cases, MAX_RETRIES, RETRY_DELAY, LOG_FILE,
#               EXIT_ON_FAILURE
# Returns:
#   0 - the test passed on some attempt
#   1 - every attempt failed and EXIT_ON_FAILURE is not 1
#   2 - a configuration error was found in the log (no further retries)
# Exits the script with status 1 when every attempt failed and
# EXIT_ON_FAILURE is 1.
run_test_with_retry() {
  local test_name="$1"
  local attempt=1
  local retval

  echo "=========================================="
  echo "Running: $test_name" | tee -a "$LOG_FILE"
  echo "Command: ${test_cases[$test_name]}" | tee -a "$LOG_FILE"
  echo "=========================================="

  while [ "$attempt" -le "$MAX_RETRIES" ]; do
    echo "Attempt $attempt/$MAX_RETRIES for: $test_name"

    # NOTE(review): eval is required because the stored command lines carry
    # their own quoting and environment prefixes (LD_PRELOAD=...). They are
    # built from environment variables, so those must be trusted.
    eval "${test_cases[$test_name]}" 2>&1 | tee -a "$LOG_FILE"
    # Status of the test command itself, not of tee.
    retval=${PIPESTATUS[0]}

    if [[ $retval == 0 ]]; then
      echo "✓ Test passed: $test_name" | tee -a "$LOG_FILE"
      return 0
    elif ! check_configuration_errors; then
      echo "✗ Test failed due to configuration errors: $test_name (attempt $attempt/$MAX_RETRIES)" | tee -a "$LOG_FILE"
      return 2
    else
      # The pipe to tee used to sit inside the quotes, so the message was
      # printed with a literal "| tee -a ..." suffix and never logged.
      echo "✗ Test failed: $test_name (attempt $attempt/$MAX_RETRIES)" | tee -a "$LOG_FILE"

      kill_test_processes

      if [ "$attempt" -lt "$MAX_RETRIES" ]; then
        echo "Waiting $RETRY_DELAY seconds before retry..."
        sleep "$RETRY_DELAY"

        reset_ice_driver
        bind_driver_to_dpdk
      fi

      start_mtl_manager
      attempt=$((attempt + 1))
    fi
  done

  echo "✗ Test failed after $MAX_RETRIES attempts: $test_name" | tee -a "$LOG_FILE"

  if [ "$EXIT_ON_FAILURE" -eq 1 ]; then
    echo "Exiting due to test failure."
    kill_test_processes
    time_taken_by_script
    exit 1
  fi

  # Without an explicit status the function would return 0 here and the
  # caller would record the failed test as passed.
  return 1
}
246+
247+
# Report the effective settings for this run.
printf '%s\n' \
  "Starting MTL test suite..." \
  "Maximum retries per test: $MAX_RETRIES" \
  "Retry delay: $RETRY_DELAY seconds" \
  "Exit on failure: $EXIT_ON_FAILURE" \
  "MTL_LD_PRELOAD path: $MTL_LD_PRELOAD" \
  "MUFD_CFG path: $MUFD_CFG" \
  ""

# Start from a clean state: no stale test binaries, MtlManager running.
kill_test_processes
start_mtl_manager

failed_tests=()
passed_tests=()

# Abort the run (status 1) unless both named variables are non-empty.
# Arguments: $1, $2 - names of the variables to check
_abort_if_unset() {
  if [[ -n "${!1}" && -n "${!2}" ]]; then
    return 0
  fi
  printf '%s\n' \
    "Error: $1 or $2 environment variables are not set" \
    "$1=${!1}" \
    "$2=${!2}"
  time_taken_by_script
  exit 1
}

# Ports not supplied by the environment are discovered automatically.
[[ -n "$TEST_PORT_1" && -n "$TEST_PORT_2" ]] || bind_driver_to_dpdk

_abort_if_unset TEST_PORT_1 TEST_PORT_2
_abort_if_unset TEST_DMA_PORT_P TEST_DMA_PORT_R

generate_test_cases
282+
283+
# Run every registered test. A configuration error aborts the whole run; any
# other failure is recorded and stops the loop.
for test_name in "${!test_cases[@]}"; do
  echo "$test_name" "${test_cases[$test_name]}"

  run_test_with_retry "$test_name"
  status=$?

  case "$status" in
    0)
      passed_tests+=("$test_name")
      echo "✓ Test passed: $test_name"
      ;;
    2)
      echo "✗ Test aborted due to configuration errors: $test_name"
      kill_test_processes
      time_taken_by_script
      exit 1
      ;;
    *)
      failed_tests+=("$test_name")
      break
      ;;
  esac
done

kill_test_processes

# Summary: passed tests first, then failed ones; the exit status is 1 when
# at least one test failed and 0 otherwise.
summary_rule="=========================================="
exit_status=0

if ((${#passed_tests[@]} > 0)); then
  printf '%s\n' "" "$summary_rule" "Tests passed:" "${passed_tests[@]}" "$summary_rule"
fi

if ((${#failed_tests[@]} > 0)); then
  printf '%s\n' "" "$summary_rule" "Tests failed:"
  printf ' - %s\n' "${failed_tests[@]}"
  printf '%s\n' "$summary_rule"
  exit_status=1
fi

time_taken_by_script
exit "$exit_status"

0 commit comments

Comments
 (0)