
Commit 5479fc8

Merge pull request #3236 from IntersectMBO/timeout_get_cluster_one_hour
fix(testrun): add timeouts for testrun
2 parents: 825794a + 7b94d09

6 files changed: +124 -17 lines changed


.github/regression.sh

Lines changed: 28 additions & 0 deletions
@@ -207,6 +207,34 @@ if [ "$(echo "$PWD"/.bin/*)" != "${PWD}/.bin/*" ]; then
   echo
 fi
 
+# function to monitor system resources and log them every 10 minutes
+monitor_system() {
+  : > monitor.log
+
+  while true; do
+    {
+      echo "===== $(date) ====="
+      echo "--- CPU ---"
+      top -b -n1 | head -5
+      echo "--- MEM ---"
+      free -h
+      echo "--- DISK ---"
+      df -h .
+      echo
+    } >> monitor.log
+
+    sleep 600  # 10 minutes
+  done
+}
+
+# start monitor in background
+monitor_system &
+MON_PID=$!
+
+# ensure cleanup on ANY exit (success, error, Ctrl-C, set -e, etc.)
+# shellcheck disable=SC2064
+trap "echo 'Stopping monitor'; kill $MON_PID 2>/dev/null || true" EXIT
+
 # Run tests and generate report
 
 # shellcheck disable=SC2046,SC2119
.github/run_tests.sh

Lines changed: 17 additions & 7 deletions
@@ -21,6 +21,7 @@
 # DESELECT_FROM_FILE: path to file with tests to deselect
 # CLUSTERS_COUNT: number of local testnet clusters to launch
 # FORBID_RESTART: if set to 1, do not restart clusters between tests
+# SESSION_TIMEOUT: overall timeout for the test session (e.g. 10800 for 3 hours)
 #
 # Notes:
 # - If PYTEST_ARGS is provided, we disable cleanup and the initial "skip all" pass.
@@ -51,9 +52,15 @@ All targets respect the same env vars as the original Makefile.
 EOF
 }
 
-pytest_w_echo() {
-  echo "Running: PYTEST_ADDOPTS='${PYTEST_ADDOPTS:-}' pytest $*"
-  pytest "$@"
+run_pytest() {
+  if [ -n "${SESSION_TIMEOUT:-}" ]; then
+    local -a timeout_arr=( "--signal=INT" "--kill-after=0" "$SESSION_TIMEOUT" )
+    echo "Running: PYTEST_ADDOPTS='${PYTEST_ADDOPTS:-}' timeout ${timeout_arr[*]} pytest $*"
+    timeout "${timeout_arr[@]}" pytest "$@"
+  else
+    echo "Running: PYTEST_ADDOPTS='${PYTEST_ADDOPTS:-}' pytest $*"
+    pytest "$@"
+  fi
 }
 
 ensure_dirs() {
@@ -133,7 +140,7 @@ initial_skip_pass() {
 }
 
 run_real_tests() {
-  pytest_w_echo \
+  run_pytest \
     "$TESTS_DIR" \
     "${MARKEXPR_ARR[@]}" \
     "${DESELECT_FROM_FILE_ARR[@]}" \
@@ -157,41 +164,44 @@ ensure_markexpr_default() {
 target_tests() {
   export DbSyncAbortOnPanic="${DbSyncAbortOnPanic:-1}"
   TEST_THREADS="${TEST_THREADS:-20}"
+  SESSION_TIMEOUT="${SESSION_TIMEOUT:-10800}"
 
   ensure_dirs
   set_common_env
   compute_common_args
   cleanup_previous_run
   initial_skip_pass
-  run_real_tests "$@"
+  run_real_tests --timeout=7200 "$@"
 }
 
 target_testpr() {
   export TESTPR=1
   export CLUSTERS_COUNT="${CLUSTERS_COUNT:-5}"
   TEST_THREADS="${TEST_THREADS:-20}"
+  SESSION_TIMEOUT="${SESSION_TIMEOUT:-2700}"
   ensure_markexpr_default "smoke"
 
   ensure_dirs
   set_common_env
   compute_common_args
   cleanup_previous_run
   initial_skip_pass
-  run_real_tests "$@"
+  run_real_tests --timeout=1200 "$@"
 }
 
 target_testnets() {
   export CLUSTERS_COUNT=1
   export FORBID_RESTART=1
   TEST_THREADS="${TEST_THREADS:-15}"
+  SESSION_TIMEOUT="${SESSION_TIMEOUT:-72000}"
   ensure_markexpr_default "testnets"
 
   ensure_dirs
   set_common_env
   compute_common_args
   cleanup_previous_run
   initial_skip_pass
-  run_real_tests "$@"
+  run_real_tests --timeout=7200 "$@"
 }
 
 # Dispatch
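
The timeout wrapper in run_pytest() sends SIGINT (rather than timeout's default SIGTERM) at the session deadline, so pytest still gets a chance to run fixture teardown and write its reports. A rough standalone Python sketch of that behaviour, for illustration only (the helper name and the 2700-second value are made up, and the "--kill-after" handling is omitted):

import signal
import subprocess
import sys


def run_with_session_timeout(pytest_args: list[str], session_timeout: float) -> int:
    """Run pytest and send SIGINT once the session deadline is reached."""
    proc = subprocess.Popen(["pytest", *pytest_args])
    try:
        return proc.wait(timeout=session_timeout)
    except subprocess.TimeoutExpired:
        proc.send_signal(signal.SIGINT)  # rough equivalent of `timeout --signal=INT`
        return proc.wait()


if __name__ == "__main__":
    sys.exit(run_with_session_timeout(sys.argv[1:], session_timeout=2700))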

.github/workflows/regression_reusable.yaml

Lines changed: 1 addition & 0 deletions
@@ -156,6 +156,7 @@ jobs:
             testrun-report.xml
             deselected_tests.txt
             requirements_coverage.json
+            monitor.log
       - name: ↟ Upload CLI coverage
         uses: actions/upload-artifact@v5
         if: success() || failure()

cardano_node_tests/cluster_management/cluster_getter.py

Lines changed: 61 additions & 8 deletions
@@ -99,6 +99,15 @@ def __init__(
         self.pytest_tmp_dir = temptools.get_pytest_root_tmp()
         self.cluster_lock = common.get_cluster_lock_file()
 
+        # Soft timeout (seconds): applies when no cluster is selected.
+        self.grace_period_soft = 3600
+        # Hard timeout (seconds): always applies, regardless of cluster selection.
+        self.grace_period_hard = 7200
+        # Time window (seconds) before deadline when stricter dead cluster checks apply.
+        self.strict_check_window = 1200
+        # Maximum allowed fraction of dead clusters during strict check window.
+        self.strict_dead_fraction = 0.51
+
         self._cluster_instance_num = -1
 
     @property
@@ -564,13 +573,42 @@ def _marked_select_instance(self, cget_status: _ClusterGetStatus) -> bool:
         # If here, this will be the first test with the mark
         return True
 
-    def _fail_on_all_dead(self) -> None:
-        """Fail if all cluster instances are dead."""
-        dead_clusters = status_files.list_cluster_dead_files()
-        if len(dead_clusters) == self.num_of_instances:
-            msg = "All clusters are dead, cannot run."
+    def _check_dead_fraction(self, max_dead_fraction: float) -> None:
+        """Fail if the fraction of dead cluster instances is too high."""
+        total = self.num_of_instances
+        if total == 0:
+            msg = "Number of cluster instances must be greater than 0."
+            raise ValueError(msg)
+        dead_count = len(status_files.list_cluster_dead_files())
+        dead_fraction = dead_count / total
+
+        if dead_fraction >= max_dead_fraction:
+            if dead_count == total:
+                msg = "All cluster instances are dead."
+            else:
+                msg = (
+                    "Too many cluster instances are dead: "
+                    f"{dead_count} out of {total} "
+                    f"({dead_fraction:.0%} dead, "
+                    f"maximum allowed: {max_dead_fraction:.0%})."
+                )
             raise RuntimeError(msg)
 
+    def _fail_on_dead_clusters(self, remaining_time_sec: float) -> None:
+        """Fail based on how many cluster instances are dead and time left.
+
+        Use a stricter failure threshold as we approach the deadline.
+        If we've been waiting a long time and too many cluster instances are dead,
+        it's better to fail than continue trying with too few usable instances.
+        """
+        if remaining_time_sec <= self.strict_check_window:
+            max_dead_fraction = self.strict_dead_fraction
+        else:
+            # Early in the wait period we only fail if all instances are dead.
+            max_dead_fraction = 1.0
+
+        self._check_dead_fraction(max_dead_fraction)
+
     def _cleanup_dead_clusters(self, cget_status: _ClusterGetStatus) -> None:
         """Cleanup if the selected cluster instance failed to start."""
         # Move on to other cluster instance
@@ -805,8 +843,24 @@ def get_cluster_instance( # noqa: C901
 
         self.log(f"want to run test '{cget_status.current_test}'")
 
-        # Iterate until it is possible to start the test
+        # Iterate until it is possible to start the test. Timeout after grace period.
+        now = time.monotonic()
+        deadline_soft = now + self.grace_period_soft
+        deadline_hard = now + self.grace_period_hard
         while True:
+            now = time.monotonic()
+            remaining_soft = deadline_soft - now
+            remaining_hard = deadline_hard - now
+
+            # Timeout after soft grace period if no cluster instance was selected yet
+            if cget_status.selected_instance == -1 and remaining_soft <= 0:
+                msg = "Timeout (soft) while waiting to obtain cluster instance."
+                raise TimeoutError(msg)
+            # Timeout after hard grace period even if cluster instance was already selected
+            if remaining_hard <= 0:
+                msg = "Timeout (hard) while waiting to obtain cluster instance."
+                raise TimeoutError(msg)
+
             if cget_status.respin_ready:
                 self._respin(scriptsdir=scriptsdir)
 
@@ -819,8 +873,7 @@ def get_cluster_instance( # noqa: C901
             if self._is_already_running():
                 return self.cluster_instance_num
 
-            # Fail if all cluster instances are dead
-            self._fail_on_all_dead()
+            self._fail_on_dead_clusters(remaining_time_sec=remaining_soft)
 
             if mark:
                 # Check if tests with my mark are already locked to any cluster instance
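
The new attributes set in __init__ parameterize the waiting loop above: far from the deadline only "all instances dead" is fatal, while inside the strict window a majority of dead instances already aborts the wait. A minimal self-contained sketch of that thresholding (illustrative values and simplified helper names, not part of this commit):

STRICT_CHECK_WINDOW = 1200  # seconds before the deadline when the strict check kicks in
STRICT_DEAD_FRACTION = 0.51  # maximum tolerated fraction of dead instances near the deadline


def max_allowed_dead_fraction(remaining_time_sec: float) -> float:
    # Far from the deadline, only "all instances dead" (fraction 1.0) is fatal;
    # inside the strict window, more than half dead is already fatal.
    return STRICT_DEAD_FRACTION if remaining_time_sec <= STRICT_CHECK_WINDOW else 1.0


def check_dead_fraction(dead_count: int, total: int, remaining_time_sec: float) -> None:
    if total == 0:
        raise ValueError("Number of cluster instances must be greater than 0.")
    if dead_count / total >= max_allowed_dead_fraction(remaining_time_sec):
        raise RuntimeError(f"Too many dead cluster instances: {dead_count} out of {total}.")


# With 5 instances and 3 dead (60%): passes 30 minutes before the deadline,
# but raises RuntimeError once fewer than 20 minutes remain.
check_dead_fraction(dead_count=3, total=5, remaining_time_sec=1800)
# check_dead_fraction(dead_count=3, total=5, remaining_time_sec=600)  # would raise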

poetry.lock

Lines changed: 16 additions & 2 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ PyYAML = "^6.0.2"
 requests = "^2.32.4"
 pytest-subtests = "^0.14.2"
 cardonnay = "^0.2.8"
+pytest-timeout = "^2.4.0"
 
 [tool.poetry.group.dev]
 optional = true
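
The pytest-timeout plugin added above provides the per-test --timeout option that run_tests.sh now passes to pytest. It also offers a marker for individual tests; a minimal illustrative example (the test itself is hypothetical, not from the repository):

import time

import pytest


@pytest.mark.timeout(5)  # fail this test if it runs for longer than 5 seconds
def test_finishes_quickly() -> None:
    time.sleep(1)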
