Skip to content

Commit 6e8faaa

Browse files
committed
Review feedback and other minor improvements
- keep-failed-namespaces has been changed to delete-failed-namespaces (default false) - A unique work directory is created for each test run to avoid interference - The logs now contain the exact command that was used to run the tests - Script tried to delete already deleted namespaces
1 parent 9a56acc commit 6e8faaa

File tree

2 files changed

+73
-34
lines changed

2 files changed

+73
-34
lines changed

template/scripts/auto-retry-tests.py

Lines changed: 59 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
This script:
66
1. Runs the full test suite initially (with normal cleanup)
77
2. Identifies failed tests and retries them with configurable strategy
8-
3. Manages test namespaces intelligently (cleanup on success, keep failed for debugging)
8+
3. Manages test namespaces intelligently (cleanup on success, keep failed for debugging by default)
99
4. Provides detailed logging and comprehensive reporting
1010
1111
Usage: ./scripts/auto-retry-tests.py --parallel 4 --attempts-serial 3 --attempts-parallel 2 --venv ./venv
@@ -16,6 +16,7 @@
1616
import json
1717
import os
1818
import re
19+
import shutil
1920
import statistics
2021
import subprocess
2122
import sys
@@ -32,8 +33,7 @@ class TestConstants:
3233
"""Constants used throughout the test runner."""
3334

3435
MIN_RUNTIME_THRESHOLD = 30.0 # Filter out quick failures (seconds)
35-
MAX_RUNTIME_HISTORY = 10 # Keep only recent runs
36-
MAX_ERROR_LINES_TO_CHECK = 50 # Lines to scan for errors in logs
36+
MAX_RUNTIME_HISTORY = 50 # Keep only recent runs
3737
MAX_TEST_NAME_LENGTH = 100 # Maximum test name length for filenames
3838
HASH_SUFFIX_LENGTH = 8 # Length of MD5 hash suffix
3939
DEFAULT_PARALLEL_WORKERS = 2 # Default number of parallel workers
@@ -231,7 +231,7 @@ def build_configuration_dict(self, args) -> dict:
231231
"parallel": args.parallel,
232232
"attempts_parallel": args.attempts_parallel,
233233
"attempts_serial": args.attempts_serial,
234-
"keep_failed_namespaces": args.keep_failed_namespaces,
234+
"delete_failed_namespaces": args.delete_failed_namespaces,
235235
"venv": args.venv,
236236
"extra_args": args.extra_args,
237237
"output_dir": str(self.output_dir),
@@ -378,7 +378,7 @@ def create_safe_log_filename(
378378
return f"{safe_test_name}_attempt_{attempt}_{attempt_type}.txt"
379379

380380
def build_test_command(
381-
self, test_name: str = None, skip_delete: bool = False
381+
self, test_name: str = None, skip_delete: bool = False, work_dir: str = None
382382
) -> List[str]:
383383
"""Build the command arguments for running tests."""
384384
command_args = ["scripts/run-tests"]
@@ -393,6 +393,13 @@ def build_test_command(
393393
if test_name:
394394
command_args.extend(["--test", test_name])
395395

396+
# Add unique work directory to prevent parallel interference
397+
# beku deletes the work dir at the start of a test so if a test was already running and then
398+
# another starts the new one would delete (and recreate) the work directory.
399+
# This does lead to failures.
400+
if work_dir:
401+
command_args.extend(["--work-dir", work_dir])
402+
396403
# Add any extra arguments passed through
397404
if self.args.extra_args:
398405
command_args.extend(self.args.extra_args)
@@ -484,7 +491,20 @@ def run_single_test_suite(
484491
attempt_type: str = "initial",
485492
) -> TestResult:
486493
"""Run a single test or the full test suite."""
487-
command_args = self.build_test_command(test_name, skip_delete)
494+
# Create unique work directory to prevent parallel test interference
495+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[
496+
:-3
497+
] # microseconds to milliseconds
498+
if test_name:
499+
# Create a unique work directory based on test name and timestamp
500+
safe_test_name = test_name.replace("/", "_").replace(",", "_")
501+
work_dir = (
502+
f"tests/_work_{safe_test_name}_{attempt}_{attempt_type}_{timestamp}"
503+
)
504+
else:
505+
work_dir = f"tests/_work_full_suite_{attempt}_{attempt_type}_{timestamp}"
506+
507+
command_args = self.build_test_command(test_name, skip_delete, work_dir)
488508

489509
# Set up log file
490510
if test_name:
@@ -519,6 +539,12 @@ def run_single_test_suite(
519539
start_time = time.time()
520540

521541
with open(log_file, "w") as file_handle:
542+
# Log the exact command being executed
543+
file_handle.write(f"Command: {' '.join(command_args)}\n")
544+
file_handle.write(f"Working directory: {os.getcwd()}\n")
545+
file_handle.write("=" * 80 + "\n\n")
546+
file_handle.flush()
547+
522548
result = subprocess.run(
523549
command_args,
524550
stdout=file_handle,
@@ -569,10 +595,18 @@ def run_single_test_suite(
569595
print(f" 📊 Average: {self.format_duration(avg_runtime)}")
570596

571597
# If this was a skip_delete attempt and test passed, clean up the namespace
598+
# We only want to keep failed namespaces
572599
if test_name and skip_delete and success and namespace:
573600
self.delete_test_namespace(namespace)
574601
print(f" 🧹 Test passed, cleaning up namespace: {namespace}")
575602

603+
# Clean up the unique work directory after test completion
604+
if work_dir and Path(work_dir).exists():
605+
try:
606+
shutil.rmtree(work_dir)
607+
except Exception as exception:
608+
print(f" ⚠️ Failed to clean up work directory {work_dir}: {exception}")
609+
576610
return test_result
577611

578612

@@ -727,7 +761,9 @@ def generate_comprehensive_report(self, runner, start_time: datetime) -> str:
727761
report.append(f"Parallel: {runner.args.parallel}")
728762
report.append(f"Parallel retry attempts: {runner.args.attempts_parallel}")
729763
report.append(f"Serial retry attempts: {runner.args.attempts_serial}")
730-
report.append(f"Keep failed namespaces: {runner.args.keep_failed_namespaces}")
764+
report.append(
765+
f"Delete failed namespaces: {runner.args.delete_failed_namespaces}"
766+
)
731767
report.append(f"Virtualenv: {runner.args.venv or 'None'}")
732768
report.append("")
733769

@@ -882,7 +918,7 @@ def retry_tests_in_parallel(
882918
# Determine if this is the last attempt and no serial tests follow
883919
is_last_attempt = attempt == max_attempts
884920
use_skip_delete = (
885-
self.args.keep_failed_namespaces
921+
not self.args.delete_failed_namespaces
886922
and not serial_tests_follow
887923
and is_last_attempt
888924
)
@@ -892,10 +928,6 @@ def retry_tests_in_parallel(
892928
print(
893929
f"Retrying {len(tests_to_retry)} tests in parallel (max {max_parallel} at once)..."
894930
)
895-
if use_skip_delete:
896-
print(
897-
" Using skip-delete for this final parallel attempt (no serial tests follow)"
898-
)
899931

900932
# Execute tests in parallel
901933
with ThreadPoolExecutor(max_workers=max_parallel) as executor:
@@ -952,10 +984,12 @@ def retry_test_serially(
952984
for attempt in range(1, max_attempts + 1):
953985
# Only use skip-delete on the last attempt
954986
is_last_attempt = attempt == max_attempts
987+
use_skip_delete = not self.args.delete_failed_namespaces and is_last_attempt
988+
955989
result = self.test_executor.run_single_test_suite(
956990
self.output_dir,
957991
test_name=test_name,
958-
skip_delete=self.args.keep_failed_namespaces and is_last_attempt,
992+
skip_delete=use_skip_delete,
959993
attempt=attempt,
960994
attempt_type="serial",
961995
)
@@ -992,23 +1026,15 @@ def create_test_summary(
9921026
else:
9931027
final_status = "failed"
9941028

995-
# Find the last namespace for failed tests
1029+
# Find the last namespace for failed tests (only if keeping failed namespaces)
9961030
final_namespace = None
997-
if final_status == "failed":
1031+
if final_status == "failed" and not self.args.delete_failed_namespaces:
9981032
# Keep the last failed attempt's namespace
9991033
for result in reversed(retry_results):
10001034
if result.namespace:
10011035
final_namespace = result.namespace
10021036
break
10031037

1004-
# Clean up namespaces for successful tests
1005-
if final_status in ["passed", "flaky"]:
1006-
# Delete all namespaces for this test
1007-
all_results = [initial_result] + retry_results
1008-
for result in all_results:
1009-
if result.namespace and result.namespace != final_namespace:
1010-
self.test_executor.delete_test_namespace(result.namespace)
1011-
10121038
summary = TestSummary(
10131039
test_name=test_name,
10141040
initial_result=initial_result,
@@ -1078,8 +1104,14 @@ def _run_initial_test_suite(self) -> bool:
10781104
)
10791105

10801106
if not failed_tests:
1081-
print(" No failed tests found in output (this might be a parsing issue)")
1082-
self.report_generator.generate_and_save_final_report(self, self.start_time)
1107+
print(" No failed tests found in output but run-tests exited with code 1")
1108+
print(
1109+
" This indicates an infrastructure or setup issue that prevents tests from running"
1110+
)
1111+
print(
1112+
" Check the log file for connection errors, missing dependencies, or cluster issues"
1113+
)
1114+
print(f" Log file: {initial_result.log_file}")
10831115
return False
10841116

10851117
print(f" Found {len(failed_tests)} failed tests:")
@@ -1281,9 +1313,9 @@ def main():
12811313

12821314
# Namespace management arguments
12831315
parser.add_argument(
1284-
"--keep-failed-namespaces",
1316+
"--delete-failed-namespaces",
12851317
action="store_true",
1286-
help="Keep namespaces of failed tests for debugging (only the last one is kept)",
1318+
help="Delete namespaces of failed tests (default: keep them for debugging)",
12871319
)
12881320

12891321
# Output arguments

template/scripts/run-tests

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,14 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
135135
type=str,
136136
required=False,
137137
)
138+
139+
parser.add_argument(
140+
"--work-dir",
141+
help="Working directory for test generation and execution (default: tests/_work)",
142+
type=str,
143+
required=False,
144+
default=os.path.join("tests", "_work"),
145+
)
138146

139147
return parser.parse_args(argv)
140148

@@ -281,7 +289,6 @@ def maybe_install_release(skip_release: bool, release_file: str) -> None:
281289
if skip_release:
282290
logging.debug("Skip release installation")
283291
return
284-
stackablectl_err = ""
285292
try:
286293
stackablectl_cmd = [
287294
"stackablectl",
@@ -313,7 +320,7 @@ def maybe_install_release(skip_release: bool, release_file: str) -> None:
313320
raise TestRunnerException()
314321

315322

316-
def gen_tests(test_suite: str, namespace: str) -> None:
323+
def gen_tests(test_suite: str, namespace: str, work_dir: str) -> None:
317324
try:
318325
beku_cmd = [
319326
"beku",
@@ -324,7 +331,7 @@ def gen_tests(test_suite: str, namespace: str) -> None:
324331
"--template_dir",
325332
os.path.join("tests", "templates", "kuttl"),
326333
"--output_dir",
327-
os.path.join("tests", "_work"),
334+
work_dir,
328335
]
329336
if test_suite:
330337
beku_cmd.extend(["--suite", test_suite])
@@ -341,7 +348,7 @@ def gen_tests(test_suite: str, namespace: str) -> None:
341348
raise TestRunnerException()
342349

343350

344-
def run_tests(test: str, parallel: int, namespace: str, skip_delete: bool) -> None:
351+
def run_tests(test: str, parallel: int, namespace: str, skip_delete: bool, work_dir: str) -> None:
345352
try:
346353
kuttl_cmd = ["kubectl-kuttl", "test"]
347354
if test:
@@ -359,7 +366,7 @@ def run_tests(test: str, parallel: int, namespace: str, skip_delete: bool) -> No
359366

360367
subprocess.run(
361368
kuttl_cmd,
362-
cwd="tests/_work",
369+
cwd=work_dir,
363370
check=True,
364371
)
365372
except subprocess.CalledProcessError:
@@ -425,13 +432,13 @@ def main(argv) -> int:
425432
opts = parse_args(argv[1:])
426433
logging.basicConfig(encoding="utf-8", level=opts.log_level)
427434
have_requirements()
428-
gen_tests(opts.test_suite, opts.namespace)
435+
gen_tests(opts.test_suite, opts.namespace, opts.work_dir)
429436
with release_file(opts.operator, opts.skip_operator) as f:
430437
maybe_install_release(opts.skip_release, f)
431438
if opts.skip_tests:
432439
logging.info("Skip running tests.")
433440
else:
434-
run_tests(opts.test, opts.parallel, opts.namespace, opts.skip_delete)
441+
run_tests(opts.test, opts.parallel, opts.namespace, opts.skip_delete, opts.work_dir)
435442
except TestRunnerException:
436443
ret = 1
437444
return ret

0 commit comments

Comments
 (0)